Source code for inspire_schemas.utils

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE-SCHEMAS.
# Copyright (C) 2016, 2017 CERN.
#
# INSPIRE-SCHEMAS is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# INSPIRE-SCHEMAS is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE-SCHEMAS; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.

"""Public api for methods and functions to handle/verify the jsonschemas."""

import copy
import json
import os
import re

import six
from inspire_utils.date import PartialDate
from jsonschema import validate as jsonschema_validate
from jsonschema import RefResolver, draft4_format_checker
from pkg_resources import resource_filename
from unidecode import unidecode

from six.moves.urllib.parse import urlsplit

from .errors import SchemaKeyNotFound, SchemaNotFound

_schema_root_path = os.path.abspath(resource_filename(__name__, 'records'))
_resolved_schema_root_path = os.path.abspath(resource_filename(__name__, 'resolved_records'))

_RE_2_CHARS = re.compile(r'[a-z].*[a-z]', re.IGNORECASE)
_RE_CHAR = re.compile(r'[a-z]', re.IGNORECASE)
_RE_AND = re.compile(r'\band\b', re.IGNORECASE)
_RE_COLLABORATION_LEADING = re.compile(
    r'^\s*(\b(for|on behalf of|representing)\b)?\s*(\bthe\b)?', re.IGNORECASE
)
_RE_COLLABORATION_TRAILING = re.compile(
    r'\bcollaborations?\s*$', re.IGNORECASE
)
_RE_LICENSE_URL = re.compile(
    r'^/licenses/(?P<sublicense>[-\w]*)(?:/(?P<version>[\.\d]*))?'
)
_RE_VOLUME_STARTS_WITH_A_LETTER = re.compile(
    r'^(?P<letter>[A-Z])(?P<volume>\d+)', re.IGNORECASE
)
_RE_VOLUME_ENDS_WITH_A_LETTER = re.compile(
    r'(?P<volume>\d+)(?P<letter>[A-Z])$', re.IGNORECASE
)
_RE_TITLE_ENDS_WITH_A_LETTER = re.compile(
    r'(?P<title>.+(\.| ))(?P<letter>[A-Z])$', re.IGNORECASE
)


# list produced from https://arxiv.org/archive/
_NEW_CATEGORIES = {
    'acc-phys': 'physics.acc-ph',
    'adap-org': 'nlin.AO',
    'alg-geom': 'math.AG',
    'ao-sci': 'physics.ao-ph',
    'atom-ph': 'physics.atom-ph',
    'bayes-an': 'physics.data-an',
    'chao-dyn': 'nlin.CD',
    'chem-ph': 'physics.chem-ph',
    'cmp-lg': 'cs.CL',
    'comp-gas': 'nlin.CG',
    'dg-ga': 'math.DG',
    'funct-an': 'math.FA',
    'mtrl-th': 'cond-mat.mtrl-sci',
    'patt-sol': 'nlin.PS',
    'plasm-ph': 'physics.plasm-ph',
    'q-alg': 'math.QA',
    'solv-int': 'nlin.SI',
    'supr-con': 'cond-mat.supr-con',
}

ARXIV_TO_INSPIRE_CATEGORY_MAPPING = {
    'astro-ph': 'Astrophysics',
    'astro-ph.CO': 'Astrophysics',
    'astro-ph.EP': 'Astrophysics',
    'astro-ph.GA': 'Astrophysics',
    'astro-ph.HE': 'Astrophysics',
    'astro-ph.IM': 'Instrumentation',
    'astro-ph.SR': 'Astrophysics',
    'cond-mat': 'General Physics',
    'cond-mat.dis-nn': 'General Physics',
    'cond-mat.mes-hall': 'General Physics',
    'cond-mat.mtrl-sci': 'General Physics',
    'cond-mat.other': 'General Physics',
    'cond-mat.quant-gas': 'General Physics',
    'cond-mat.soft': 'General Physics',
    'cond-mat.stat-mech': 'General Physics',
    'cond-mat.str-el': 'General Physics',
    'cond-mat.supr-con': 'General Physics',
    'cs': 'Computing',
    'cs.AI': 'Computing',
    'cs.AR': 'Computing',
    'cs.CC': 'Computing',
    'cs.CE': 'Computing',
    'cs.CG': 'Computing',
    'cs.CL': 'Computing',
    'cs.CR': 'Computing',
    'cs.CV': 'Computing',
    'cs.CY': 'Computing',
    'cs.DB': 'Computing',
    'cs.DC': 'Computing',
    'cs.DL': 'Computing',
    'cs.DM': 'Computing',
    'cs.DS': 'Computing',
    'cs.ET': 'Computing',
    'cs.FL': 'Computing',
    'cs.GL': 'Computing',
    'cs.GR': 'Computing',
    'cs.GT': 'Computing',
    'cs.HC': 'Computing',
    'cs.IR': 'Computing',
    'cs.IT': 'Computing',
    'cs.LG': 'Computing',
    'cs.LO': 'Computing',
    'cs.MA': 'Computing',
    'cs.MM': 'Computing',
    'cs.MS': 'Computing',
    'cs.NA': 'Computing',
    'cs.NE': 'Computing',
    'cs.NI': 'Computing',
    'cs.OH': 'Computing',
    'cs.OS': 'Computing',
    'cs.PF': 'Computing',
    'cs.PL': 'Computing',
    'cs.RO': 'Computing',
    'cs.SC': 'Computing',
    'cs.SD': 'Computing',
    'cs.SE': 'Computing',
    'cs.SI': 'Computing',
    'cs.SY': 'Computing',
    'gr-qc': 'Gravitation and Cosmology',
    'hep-ex': 'Experiment-HEP',
    'hep-lat': 'Lattice',
    'hep-ph': 'Phenomenology-HEP',
    'hep-th': 'Theory-HEP',
    'math': 'Math and Math Physics',
    'math-ph': 'Math and Math Physics',
    'math.AC': 'Math and Math Physics',
    'math.AG': 'Math and Math Physics',
    'math.AP': 'Math and Math Physics',
    'math.AT': 'Math and Math Physics',
    'math.CA': 'Math and Math Physics',
    'math.CO': 'Math and Math Physics',
    'math.CT': 'Math and Math Physics',
    'math.CV': 'Math and Math Physics',
    'math.DG': 'Math and Math Physics',
    'math.DS': 'Math and Math Physics',
    'math.FA': 'Math and Math Physics',
    'math.GM': 'Math and Math Physics',
    'math.GN': 'Math and Math Physics',
    'math.GR': 'Math and Math Physics',
    'math.GT': 'Math and Math Physics',
    'math.HO': 'Math and Math Physics',
    'math.IT': 'Math and Math Physics',
    'math.KT': 'Math and Math Physics',
    'math.LO': 'Math and Math Physics',
    'math.MG': 'Math and Math Physics',
    'math.MP': 'Math and Math Physics',
    'math.NA': 'Math and Math Physics',
    'math.NT': 'Math and Math Physics',
    'math.OA': 'Math and Math Physics',
    'math.OC': 'Math and Math Physics',
    'math.PR': 'Math and Math Physics',
    'math.QA': 'Math and Math Physics',
    'math.RA': 'Math and Math Physics',
    'math.RT': 'Math and Math Physics',
    'math.SG': 'Math and Math Physics',
    'math.SP': 'Math and Math Physics',
    'math.ST': 'Math and Math Physics',
    'nlin': 'General Physics',
    'nlin.AO': 'General Physics',
    'nlin.CD': 'General Physics',
    'nlin.CG': 'General Physics',
    'nlin.PS': 'Math and Math Physics',
    'nlin.SI': 'Math and Math Physics',
    'nucl-ex': 'Experiment-Nucl',
    'nucl-th': 'Theory-Nucl',
    'physics': 'General Physics',
    'physics.acc-ph': 'Accelerators',
    'physics.ao-ph': 'General Physics',
    'physics.atm-clus': 'General Physics',
    'physics.atom-ph': 'General Physics',
    'physics.bio-ph': 'Other',
    'physics.chem-ph': 'Other',
    'physics.class-ph': 'General Physics',
    'physics.comp-ph': 'Computing',
    'physics.data-an': 'Data Analysis and Statistics',
    'physics.ed-ph': 'Other',
    'physics.flu-dyn': 'General Physics',
    'physics.gen-ph': 'General Physics',
    'physics.geo-ph': 'General Physics',
    'physics.hist-ph': 'Other',
    'physics.ins-det': 'Instrumentation',
    'physics.med-ph': 'Other',
    'physics.optics': 'General Physics',
    'physics.plasm-ph': 'General Physics',
    'physics.pop-ph': 'Other',
    'physics.soc-ph': 'Other',
    'physics.space-ph': 'Astrophysics',
    'quant-ph': 'General Physics',
}

_JOURNALS_ALREADY_ENDING_WITH_A_LETTER = {
    'Acta Cryst.A',
    'Acta Cryst.B',
    'Acta Cryst.D',
    'Acta Cryst.F',
    'Adv.Phys.X',
    'Annales Soc.Sci.Bruxelles A',
    'Appl.Catal.A',
    'Appl.Sci.Res.,Sect.A',
    'Bull.Okayama Univ.Sci.A',
    'Can.J.Res.A',
    'Cesk.Cas.Fys.A',
    'Chin.Ann.Math.B',
    'Colloids Surf.A',
    'Commun.Dublin Inst.Ser.A',
    'Concepts Magn.Reson.Part A',
    'Concepts Magn.Reson.Part B',
    'Global J.Sci.Front.Res.A',
    'ITB J.Sci.A',
    'Indian J.Phys.A',
    'Indian J.Phys.B',
    'Indian J.Statist.A',
    'Iran.J.Sci.Technol.A',
    'J.Chromatogr.A',
    'J.Mol.Catal.A',
    'J.Opt.A',
    'J.Opt.B',
    'J.Polymer Sci.B',
    'J.Res.Natl.Bur.Stand.A',
    'J.Res.Natl.Bur.Stand.B',
    'Kumamoto J.Sci.Ser.A',
    'NATO Sci.Peace Secur.B',
    'NATO Sci.Ser.B',
    'NATO Sci.Ser.C',
    'NATO Sci.Ser.F',
    'Nucl.Data Sheets A',
    'Nucl.Data Sheets B',
    'Nucl.Sci.Appl.A',
    'Phil.Trans.Roy.Soc.Lond.B',
    'Polymer Sci.B',
    'Proc.Rom.Acad.A',
    'Rev.Univ.Nac.Tucuman, Ser.A',
    'Sci.Rep.Nat Tsing Hua Univ.Ser.A',
    'Spectrochim.Acta A',
    'Tellus A',
    'Trans.Int.Astron.Union A',
}

_JOURNALS_THAT_NEED_A_HIDDEN_PUBNOTE = {
    'Phys.Lett.B': set(str(el) for el in range(24, 171)),
}

_JOURNALS_RENAMED_OLD_TO_NEW = {
    'Ann.Inst.H.Poincare Anal.Non Lineaire': 'Ann.Inst.H.Poincare C Anal.Non Lineaire',
    'Annales Soc.Sci.Brux.Ser.I Sci.Math.Astron.Phys.': 'Annales Soc.Sci.Bruxelles.I',
    'Annales Soc.Sci.Bruxelles Ser.B Sci.Phys.Nat.': 'Annales Soc.Sci.Bruxelles B',
    'Diss.Abstr.Int.': 'Diss.Abstr.Int.B',
    'J.Comb.Theory Ser.': 'J.Comb.Theor.A',
    'J.Vac.Sci.Technol.A Vac.Surf.Films': 'J.Vac.Sci.Technol.A',
    'J.Vac.Sci.Technol.B Microelectron.Nanometer Struct.': 'J.Vac.Sci.Technol.B',
    'Nucl.Phys.Proc.Suppl.': 'Nucl.Phys.B Proc.Suppl.',
    'Proc.Roy.Irish Acad.(Sect.A)': 'Proc.Roy.Irish Acad.A',
    'Proc.Roy.Soc.Lond.': 'Proc.Roy.Soc.Lond.A',
    'Univ.Politech.Bucharest Sci.Bull.': 'Univ.Politech.Bucharest Sci.Bull.A',
}
_JOURNALS_RENAMED_NEW_TO_OLD = {v: k for (k, v) in six.iteritems(_JOURNALS_RENAMED_OLD_TO_NEW)}

_JOURNALS_WITH_YEAR_ADDED_TO_VOLUME = {
    'JHEP',
    'JCAP',
}


def normalize_arxiv_category(category):
    """Normalize arXiv category to be schema compliant.

    This properly capitalizes the category and replaces the dash by a dot if
    needed. If the category is obsolete, it also gets converted to its
    current equivalent.

    Example:
        >>> from inspire_schemas.utils import normalize_arxiv_category
        >>> normalize_arxiv_category('funct-an')
        u'math.FA'

    """
    category = _NEW_CATEGORIES.get(category.lower(), category)
    for valid_category in valid_arxiv_categories():
        if (category.lower() == valid_category.lower() or
                category.lower().replace('-', '.') == valid_category.lower()):
            return valid_category
    return category  # XXX: will fail validation and be logged


def valid_arxiv_categories():
    """List of all arXiv categories that ever existed.

    Example:
        >>> from inspire_schemas.utils import valid_arxiv_categories
        >>> 'funct-an' in valid_arxiv_categories()
        True

    """
    schema = load_schema('elements/arxiv_categories')
    categories = schema['enum']
    categories.extend(_NEW_CATEGORIES.keys())

    return categories


def classify_field(value):
    """Normalize ``value`` to an Inspire category.

    Args:
        value(str): an Inspire category to properly case, or an arXiv
            category to translate to the corresponding Inspire category.

    Returns:
        str: ``None`` if ``value`` is not a non-empty string,
            otherwise the corresponding Inspire category.

    """
    if not (isinstance(value, six.string_types) and value):
        return

    schema = load_schema('elements/inspire_field')
    inspire_categories = schema['properties']['term']['enum']

    for inspire_category in inspire_categories:
        if value.upper() == inspire_category.upper():
            return inspire_category

    category = normalize_arxiv_category(value)
    return ARXIV_TO_INSPIRE_CATEGORY_MAPPING.get(category, 'Other')


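# Illustrative usage (a sketch; outputs assume the bundled 'inspire_field'
# schema and the mapping above):
#
#     >>> classify_field('hep-ph')
#     'Phenomenology-HEP'
#     >>> classify_field('alg-geom')  # obsolete arXiv category
#     'Math and Math Physics'

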
def split_page_artid(page_artid):
    """Split page_artid into page_start/end and artid."""
    page_start = None
    page_end = None
    artid = None

    if not page_artid:
        return None, None, None

    # normalize unicode dashes
    page_artid = unidecode(six.text_type(page_artid))

    if '-' in page_artid:
        # if it has a dash it's a page range
        page_range = page_artid.replace('--', '-').split('-')
        if len(page_range) == 2:
            page_start, page_end = page_range
        else:
            artid = page_artid
    elif _RE_2_CHARS.search(page_artid):
        # if it has 2 or more letters it's an article ID
        artid = page_artid
    elif len(_RE_CHAR.sub('', page_artid)) >= 5:
        # if it has 5 or more digits it's an article ID
        artid = page_artid
    else:
        if artid is None:
            artid = page_artid
        if page_start is None:
            page_start = page_artid

    return page_start, page_end, artid


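# Illustrative usage (a sketch of the branching above; string types shown as
# in Python 3):
#
#     >>> split_page_artid('123-456')  # a dash means a page range
#     ('123', '456', None)
#     >>> split_page_artid('012345')   # 5 or more digits means an article ID
#     (None, None, '012345')
#     >>> split_page_artid('17')       # short number: both page and artid
#     ('17', None, '17')

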
def split_pubnote(pubnote_str):
    """Split pubnote into journal information."""
    pubnote = {}
    parts = pubnote_str.split(',')

    if len(parts) > 2:
        pubnote['journal_title'] = parts[0]
        pubnote['journal_volume'] = parts[1]
        pubnote['page_start'], pubnote['page_end'], pubnote['artid'] = split_page_artid(parts[2])

    return {key: val for (key, val) in six.iteritems(pubnote) if val is not None}


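# Illustrative usage (a sketch; keys with ``None`` values are dropped, dict
# ordering shown as in Python 3.7+):
#
#     >>> split_pubnote('Phys.Rev.,D43,1-5')
#     {'journal_title': 'Phys.Rev.', 'journal_volume': 'D43', 'page_start': '1', 'page_end': '5'}

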
def build_pubnote(title, volume, page_start, page_end, artid):
    """Build pubnote string from parts (reverse of split_pubnote)."""
    pubnote = None
    if title and volume:
        pubnote = '{},{}'.format(title, volume)
        if page_start and page_end:
            pubnote += ',{}-{}'.format(page_start, page_end)
        elif page_start:
            pubnote += ',{}'.format(page_start)
        if artid and artid != page_start:
            pubnote += ',{}'.format(artid)

    return pubnote


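# Illustrative usage (a sketch):
#
#     >>> build_pubnote('Phys.Rev.D', '43', '1', '5', None)
#     'Phys.Rev.D,43,1-5'
#     >>> build_pubnote('JHEP', '1709', None, None, '032')
#     'JHEP,1709,032'

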
class LocalRefResolver(RefResolver):
    """Simple resolver to handle non-uri relative paths."""

    def resolve_remote(self, uri):
        """Resolve a uri or relative path to a schema."""
        try:
            return super(LocalRefResolver, self).resolve_remote(uri)
        except ValueError:
            return super(LocalRefResolver, self).resolve_remote(
                'file://' + get_schema_path(uri.rsplit('.json', 1)[0])
            )


def get_schema_path(schema, resolved=False):
    """Retrieve the installed path for the given schema.

    Args:
        schema(str): relative or absolute url of the schema to validate, for
            example, 'records/authors.json' or 'jobs.json', or just the name
            of the schema, like 'jobs'.
        resolved(bool): if True, the returned path points to a fully resolved
            schema, that is to the schema with all `$ref` replaced by their
            targets.

    Returns:
        str: path to the given schema name.

    Raises:
        SchemaNotFound: if no schema could be found.

    """
    def _strip_first_path_elem(path):
        """Strip the first element of the given path, returning an empty
        string if there are no more elements. For example, 'something/other'
        will end up as 'other', but passing 'other' again will return ''.
        """
        stripped_path = path.split(os.path.sep, 1)[1:]
        return ''.join(stripped_path)

    def _schema_to_normalized_path(schema):
        """Extract the path from the url, getting rid of any '..' in the
        path, and add the json extension if it's not there.
        """
        path = os.path.normpath(os.path.sep + urlsplit(schema).path)
        if path.startswith(os.path.sep):
            path = path[1:]

        if not path.endswith('.json'):
            path += '.json'

        return path

    path = _schema_to_normalized_path(schema)
    while path:
        if resolved:
            schema_path = os.path.abspath(os.path.join(_resolved_schema_root_path, path))
        else:
            schema_path = os.path.abspath(os.path.join(_schema_root_path, path))
        if os.path.exists(schema_path):
            return os.path.abspath(schema_path)
        path = _strip_first_path_elem(path)

    raise SchemaNotFound(schema=schema)


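# Illustrative usage (a sketch; the actual path depends on where the package
# is installed, hence the skip):
#
#     >>> get_schema_path('hep')  # doctest: +SKIP
#     '.../inspire_schemas/records/hep.json'

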
def load_schema(schema_name, resolved=False):
    """Load the given schema from wherever it's installed.

    Args:
        schema_name(str): Name of the schema to load, for example 'authors'.
        resolved(bool): If True will return the resolved schema, that is with
            all the $refs replaced by their targets.

    Returns:
        dict: the schema with the given name.

    """
    schema_data = ''
    with open(get_schema_path(schema_name, resolved)) as schema_fd:
        schema_data = json.loads(schema_fd.read())

    if '$schema' not in schema_data:
        schema_data = {'$schema': schema_data}

    return schema_data


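# Illustrative usage (a sketch, assuming the bundled
# 'elements/arxiv_categories' schema carries a top-level ``enum``):
#
#     >>> schema = load_schema('elements/arxiv_categories')
#     >>> 'hep-th' in schema['enum']
#     True

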
# register PartialDate parsing as the checker for the 'date' format
inspire_format_checker = draft4_format_checker
inspire_format_checker.checks('date', raises=ValueError)(PartialDate.loads)


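# Illustrative usage (a sketch, assuming ``PartialDate`` accepts year-month
# strings; ``check`` raises ``FormatError`` on invalid values):
#
#     >>> inspire_format_checker.check('2017-09', format='date')  # no error

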
def validate(data, schema=None):
    """Validate the given dictionary against the given schema.

    Args:
        data (dict): record to validate.
        schema (Union[dict, str]): schema to validate against. If it is a
            string, it is interpreted as the name of the schema to load (e.g.
            ``authors`` or ``jobs``). If it is ``None``, the schema is taken
            from ``data['$schema']``. If it is a dictionary, it is used
            directly.

    Raises:
        SchemaNotFound: if the given schema was not found.
        SchemaKeyNotFound: if ``schema`` is ``None`` and no ``$schema`` key
            was found in ``data``.
        jsonschema.SchemaError: if the schema is invalid.
        jsonschema.ValidationError: if the data is invalid.

    """
    if schema is None:
        if '$schema' not in data:
            raise SchemaKeyNotFound(data=data)
        schema = data['$schema']

    if isinstance(schema, six.string_types):
        schema = load_schema(schema_name=schema)

    return jsonschema_validate(
        instance=data,
        schema=schema,
        resolver=LocalRefResolver.from_schema(schema),
        format_checker=inspire_format_checker,
    )


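# Illustrative call (a sketch; the minimal record below is hypothetical and
# the fields required by the 'hep' schema depend on the installed schema
# version, hence the skip):
#
#     >>> validate({'_collections': ['Literature'],
#     ...           'document_type': ['article'],
#     ...           'titles': [{'title': 'A title'}]}, 'hep')  # doctest: +SKIP

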
def normalize_collaboration(collaboration):
    """Normalize collaboration string.

    Args:
        collaboration: a string containing collaboration(s) or None

    Returns:
        list: List of extracted and normalized collaborations

    Examples:
        >>> from inspire_schemas.utils import normalize_collaboration
        >>> normalize_collaboration('for the CMS and ATLAS Collaborations')
        ['CMS', 'ATLAS']

    """
    if not collaboration:
        return []

    collaborations = _RE_AND.split(collaboration)
    collaborations = (_RE_COLLABORATION_LEADING.sub('', collab)
                      for collab in collaborations)
    collaborations = (_RE_COLLABORATION_TRAILING.sub('', collab)
                      for collab in collaborations)
    return [collab.strip() for collab in collaborations]


def get_license_from_url(url):
    """Get the license abbreviation from a URL.

    Args:
        url(str): canonical url of the license.

    Returns:
        str: the corresponding license abbreviation.

    Raises:
        ValueError: when the url is not recognized.

    """
    if not url:
        return

    split_url = urlsplit(url, scheme='http')

    if split_url.netloc.lower() == 'creativecommons.org':
        license = ['CC']
        match = _RE_LICENSE_URL.match(split_url.path)
        license.extend(part.upper() for part in match.groups() if part)
    elif split_url.netloc == 'arxiv.org':
        license = ['arXiv']
        match = _RE_LICENSE_URL.match(split_url.path)
        license.extend(part for part in match.groups() if part)
    else:
        raise ValueError('Unknown license URL')

    return u' '.join(license)


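# Illustrative usage (a sketch; abbreviations follow the two branches above):
#
#     >>> get_license_from_url('http://creativecommons.org/licenses/by/4.0/')
#     u'CC BY 4.0'
#     >>> get_license_from_url('http://example.com/license')
#     Traceback (most recent call last):
#         ...
#     ValueError: Unknown license URL

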
def convert_old_publication_info_to_new(publication_infos):
    """Convert a ``publication_info`` value from the old format to the new.

    On Legacy different series of the same journal were modeled by adding the
    letter part of the name to the journal volume. For example, a paper
    published in Physical Review D contained::

        {
            'publication_info': [
                {
                    'journal_title': 'Phys.Rev.',
                    'journal_volume': 'D43',
                },
            ],
        }

    On Labs we instead represent each series with a different journal record.
    As a consequence, the above example becomes::

        {
            'publication_info': [
                {
                    'journal_title': 'Phys.Rev.D',
                    'journal_volume': '43',
                },
            ],
        }

    This function handles the translation from the old format to the new.
    Please also see the tests for various edge cases that this function
    handles.

    Args:
        publication_infos: a ``publication_info`` in the old format.

    Returns:
        list(dict): a ``publication_info`` in the new format.

    """
    result = []
    hidden_publication_infos = []

    for publication_info in publication_infos:
        _publication_info = copy.deepcopy(publication_info)
        journal_title = _publication_info.get('journal_title')

        try:
            journal_title = _JOURNALS_RENAMED_OLD_TO_NEW[journal_title]
            _publication_info['journal_title'] = journal_title
            result.append(_publication_info)
            continue
        except KeyError:
            pass

        journal_volume = _publication_info.get('journal_volume')

        if (journal_title in _JOURNALS_WITH_YEAR_ADDED_TO_VOLUME and
                journal_volume and len(journal_volume) == 4):
            _publication_info['journal_volume'] = journal_volume[2:]
            result.append(_publication_info)
            continue

        if journal_title and journal_volume:
            volume_starts_with_a_letter = _RE_VOLUME_STARTS_WITH_A_LETTER.match(journal_volume)
            volume_ends_with_a_letter = _RE_VOLUME_ENDS_WITH_A_LETTER.match(journal_volume)
            match = volume_starts_with_a_letter or volume_ends_with_a_letter
            if match:
                _publication_info.pop('journal_record', None)
                _publication_info['journal_title'] = ''.join([
                    journal_title,
                    '' if journal_title.endswith('.') else ' ',
                    match.group('letter'),
                ])
                _publication_info['journal_volume'] = match.group('volume')

        hidden = _publication_info.pop('hidden', None)
        if hidden:
            hidden_publication_infos.append(_publication_info)
        else:
            result.append(_publication_info)

    for publication_info in hidden_publication_infos:
        if publication_info not in result:
            publication_info['hidden'] = True
            result.append(publication_info)

    return result


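# Illustrative usage, mirroring the docstring example (dict ordering shown as
# in Python 3.7+):
#
#     >>> convert_old_publication_info_to_new([
#     ...     {'journal_title': 'Phys.Rev.', 'journal_volume': 'D43'},
#     ... ])
#     [{'journal_title': 'Phys.Rev.D', 'journal_volume': '43'}]

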
def convert_new_publication_info_to_old(publication_infos):
    """Convert back a ``publication_info`` value from the new format to the old.

    Does the inverse transformation of
    :func:`convert_old_publication_info_to_new`, to be used whenever we are
    sending back records from Labs to Legacy.

    Args:
        publication_infos: a ``publication_info`` in the new format.

    Returns:
        list(dict): a ``publication_info`` in the old format.

    """
    def _needs_a_hidden_pubnote(journal_title, journal_volume):
        return (
            journal_title in _JOURNALS_THAT_NEED_A_HIDDEN_PUBNOTE and
            journal_volume in _JOURNALS_THAT_NEED_A_HIDDEN_PUBNOTE[journal_title]
        )

    result = []

    for publication_info in publication_infos:
        _publication_info = copy.deepcopy(publication_info)
        journal_title = _publication_info.get('journal_title')

        try:
            journal_title = _JOURNALS_RENAMED_NEW_TO_OLD[journal_title]
            _publication_info['journal_title'] = journal_title
            result.append(_publication_info)
            continue
        except KeyError:
            pass

        journal_volume = _publication_info.get('journal_volume')
        year = _publication_info.get('year')

        if (journal_title in _JOURNALS_WITH_YEAR_ADDED_TO_VOLUME and
                year and journal_volume and len(journal_volume) == 2):
            two_digit_year = str(year)[2:]
            _publication_info['journal_volume'] = ''.join([two_digit_year, journal_volume])
            result.append(_publication_info)
            continue

        if journal_title and journal_volume:
            match = _RE_TITLE_ENDS_WITH_A_LETTER.match(journal_title)
            if match and _needs_a_hidden_pubnote(journal_title, journal_volume):
                _publication_info['journal_title'] = match.group('title')
                _publication_info['journal_volume'] = journal_volume + match.group('letter')
                result.append(_publication_info)

                _publication_info = copy.deepcopy(publication_info)
                _publication_info['hidden'] = True
                _publication_info['journal_title'] = match.group('title')
                _publication_info['journal_volume'] = match.group('letter') + journal_volume
            elif match and journal_title not in _JOURNALS_ALREADY_ENDING_WITH_A_LETTER:
                _publication_info['journal_title'] = match.group('title')
                _publication_info['journal_volume'] = match.group('letter') + journal_volume

        result.append(_publication_info)

    return result


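# Illustrative usage, the inverse of the example above (a sketch; dict
# ordering shown as in Python 3.7+):
#
#     >>> convert_new_publication_info_to_old([
#     ...     {'journal_title': 'Phys.Rev.D', 'journal_volume': '43'},
#     ... ])
#     [{'journal_title': 'Phys.Rev.', 'journal_volume': 'D43'}]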