Source code for bibmanager.latex_manager.latex_manager

# Copyright (c) 2018-2026 Patricio Cubillos.
# bibmanager is open-source software under the MIT license (see LICENSE).

# Names exported by a star-import of this module.
__all__ = [
    'get_bibfile',
    'no_comments',
    'citations',
    'parse_subtex_files',
    'build_bib',
    'update_keys',
    'clear_latex',
    'compile_latex',
    'compile_pdflatex',
]

import datetime
import os
import re
import shutil
import subprocess
import numpy as np

from .. import bib_manager as bm
from .. import config_manager as cm
from .. import utils as u


class Replacer():
    """
    Helper that masks comments and swaps citation keys in the text of
    a .tex file.  Used by the update_keys() function.
    """
    def __init__(self, reps):
        # reps: mapping from old citation keys to their replacements.
        self.reps = reps
        # comments: placeholder string -> text it stands for.
        self.comments = {}
        # index: number of placeholders generated so far.
        self.index = 0

    def mask_comments(self, text):
        """
        Callback for re.sub(): store the matched text and return a
        unique placeholder to take its place.
        """
        self.index += 1
        placeholder = f'BIBM_COMMENT_{self.index:06d}_'
        self.comments[placeholder] = text.group()
        return placeholder

    def recover_comments(self, text):
        """
        Put back every stored text in place of the first occurrence
        of its placeholder.
        """
        for placeholder in self.comments:
            text = text.replace(placeholder, self.comments[placeholder], 1)
        return text

    def replace(self, text):
        """
        Replace the keys in a comma-separated list of citation keys
        that have an entry in self.reps.  Blanks around the keys are
        preserved.
        """
        updated = []
        for chunk in text.split(','):
            key = chunk.strip()
            if key in self.reps:
                chunk = chunk.replace(key, self.reps[key])
            updated.append(chunk)
        return ','.join(updated)



[docs]
def get_bibfile(texfile):
    r"""
    Return the name of the bib file that a .tex file refers to in
    its first '\bibliography{}' call.

    Parameters
    ----------
    texfile: String
        Name of an input tex file.

    Returns
    -------
    bibfile: String
        bib file referenced in texfile (always with a '.bib' extension).

    Raises
    ------
    ValueError
        If no '\bibliography{}' call is found.
    """
    with open(texfile, 'r', encoding='utf-8') as f:
        content = f.read()

    # Skip the preamble when there is one:
    doc_start = content.find(r'\begin{document}')
    if doc_start > 0:
        content = content[doc_start:]

    # Strip comments and append the content of any included .tex files:
    content = parse_subtex_files(content)

    # Only the first bibliography call is considered:
    match = re.search(r"\\bibliography{([^}]+)", content)
    if match is None:
        raise ValueError("No 'bibiliography' call found in tex file")

    # Drop whatever extension was given and enforce '.bib':
    root = os.path.splitext(match.group(1).strip())[0]
    return root + '.bib'




[docs]
def no_comments(text):
    r"""
    Remove comments from tex file, partially inspired by this:
    https://stackoverflow.com/questions/2319019

    A comment runs from an unescaped '%' to the end of the line.
    A single whitespace character right before the '%' (including the
    preceding line break of a comment-only line) is removed as well;
    any other character before the '%' is kept.

    Parameters
    ----------
    text: String
        Content from a latex file.

    Returns
    -------
    no_comments_text: String
        Input text with removed comments (as defined by latex format).

    Notes
    -----
    A '%' preceded by a backslash is always taken as a literal
    percentage sign, even when the backslash itself is escaped.

    Examples
    --------
    >>> import bibmanager.latex_manager as lm
    >>> text = r'''
    Hello, this is dog.
    % This is a comment line.
    This line ends with a comment. % A comment
    However, this is a percentage \%, not a comment.
    OK, bye.'''
    >>> print(lm.no_comments(text))
    Hello, this is dog.
    This line ends with a comment.
    However, this is a percentage \%, not a comment.
    OK, bye.
    >>> # Text right before a comment is preserved:
    >>> print(lm.no_comments(r'\citep{Author2020% a note'))
    \citep{Author2020
    """
    # Three alternatives, in order:
    # - a comment at the very start of the text,
    # - a comment preceded by one whitespace character (also removed),
    # - a comment preceded by any other non-backslash character, which
    #   is left in place thanks to the lookbehind.
    return re.sub(r"\A%.*|\s%.*|(?<=[^\\\s])%.*", "", text)




[docs]
def citations(text):
    r"""
    Generator over the citation keys found in a tex text.  Partially
    inspired by this: https://stackoverflow.com/questions/29976397

    Notes
    -----
    The optional square-bracket arguments of a cite call are searched
    recursively, since they can contain cite calls themselves.  Nested
    square brackets are a known failing case.

    Parameters
    ----------
    text: String
        String where to search for the latex citations.

    Yields
    ------
    citation: String
        The citation key.

    Examples
    --------
    >>> import bibmanager.latex_manager as lm
    >>> import os
    >>> # Syntax matches any of these calls:
    >>> tex = r'''
    \citep{AuthorA}.
    \citep[pre]{AuthorB}.
    \citep[pre][post]{AuthorC}.
    \citep [pre] [post] {AuthorD}.
    \citep[{\pre},][post]{AuthorE, AuthorF}.
    \citep[pre][post]{AuthorG} and \citep[pre][post]{AuthorH}.
    \citep{
     AuthorI}.
    \citep
    [][]{AuthorJ}.
    \citep[pre
     ][post] {AuthorK, AuthorL}
    \citep[see also \citealp{AuthorM}][]{AuthorN}'''
    >>> for citation in lm.citations(tex):
    >>>     print(citation, end=" ")
    AuthorA AuthorB AuthorC AuthorD AuthorE AuthorF AuthorG AuthorH AuthorI AuthorJ AuthorK AuthorL AuthorM AuthorN

    >>> # Match all of these cite calls:
    >>> tex = r'''
    \cite{AuthorA}, \nocite{AuthorB}, \defcitealias{AuthorC}.
    \citet{AuthorD}, \citet*{AuthorE}, \Citet{AuthorF}, \Citet*{AuthorG}.
    \citep{AuthorH}, \citep*{AuthorI}, \Citep{AuthorJ}, \Citep*{AuthorK}.
    \citealt{AuthorL},     \citealt*{AuthorM},
    \Citealt{AuthorN},     \Citealt*{AuthorO}.
    \citealp{AuthorP},     \citealp*{AuthorQ},
    \Citealp{AuthorR},     \Citealp*{AuthorS}.
    \citeauthor{AuthorT},  \citeauthor*{AuthorU}.
    \Citeauthor{AuthorV},  \Citeauthor*{AuthorW}.
    \citeyear{AuthorX},    \citeyear*{AuthorY}.
    \citeyearpar{AuthorZ}, \citeyearpar*{AuthorAA}.'''
    >>> for citation in lm.citations(tex):
    >>>     print(citation, end=" ")
    AuthorA AuthorB AuthorC AuthorD AuthorE AuthorF AuthorG AuthorH AuthorI AuthorJ AuthorK AuthorL AuthorM AuthorN AuthorO AuthorP AuthorQ AuthorR AuthorS AuthorT AuthorU AuthorV AuthorW AuthorX AuthorY AuthorZ AuthorAA

    >>> import numpy as np
    >>> texfile = os.path.expanduser('~')+"/.bibmanager/examples/sample.tex"
    >>> with open(texfile, encoding='utf-8') as f:
    >>>     tex = f.read()
    >>> tex = lm.no_comments(tex)
    >>> cites = [citation for citation in lm.citations(tex)]
    >>> for key in np.unique(cites):
    >>>     print(key)
    AASteamHendrickson2018aastex62
    Astropycollab2013aaAstropy
    Hunter2007ieeeMatplotlib
    JonesEtal2001scipy
    MeurerEtal2017pjcsSYMPY
    PerezGranger2007cseIPython
    vanderWaltEtal2011numpy
    """
    # The pattern is made of, in order (blanks allowed in between):
    # - a cite command: \defcitealias, \nocite, \cite, or a natbib
    #   \cite + p, t, alp, alt, author, year, yearpar (optionally
    #   capitalized and/or starred),
    # - an optional first square-bracket argument,
    # - an optional second square-bracket argument,
    # - the content of the curly braces (the keys).
    cite_call = re.compile(
        r"\\(?:defcitealias|nocite|cite|"
        r"(?:[Cc]ite(?:p|alp|t|alt|author|year|yearpar)\*?))"
        r"[\s]*(\[[^\]]*\])?"
        r"[\s]*(\[[^\]]*\])?"
        r"[\s]*{([^}]+)"
    )
    for match in cite_call.finditer(text):
        # Absent optional arguments are handled as empty strings:
        pre_note, post_note, keys = match.groups(default="")

        # Keys cited inside the first optional argument come first:
        yield from citations(pre_note)
        # Drop all blanks and the outer commas, then split into keys:
        yield from "".join(keys.split()).strip(",").split(",")
        # Finally, keys cited inside the second optional argument:
        yield from citations(post_note)




[docs]
def parse_subtex_files(tex):
    """
    Strip the comments from tex and append, recursively, the content
    of every sub-file that it includes.

    Parameters
    ----------
    tex: String
        String to parse.

    Returns
    -------
    tex: String
        Comment-free input followed by the (comment-free) content of
        all sub-files.
    """
    # Comments go first, so that commented-out includes are ignored:
    tex = no_comments(tex)
    # An \input, \include, or \subfile command followed (with optional
    # blanks in between) by the file name in curly braces:
    include_call = re.compile(r"\\(?:input|include|subfile)[\s]*{([^}]+)")
    for included in include_call.findall(tex):
        folder, name = os.path.split(os.path.realpath(included))
        # Sub-files are always read with a '.tex' extension:
        root = os.path.splitext(name.strip())[0]
        with open(f"{folder}/{root}.tex", "r", encoding='utf-8') as f:
            tex += parse_subtex_files(f.read())
    return tex




[docs]
def build_bib(texfile, bibfile=None):
    """
    Write a .bib file with the database entries cited in a tex file
    (and in the sub-files it includes).

    Parameters
    ----------
    texfile: String
        Name of an input tex file.
    bibfile: String
        Name of an output bib file.  If None, get bibfile name from
        bibliography call inside the tex file.

    Returns
    -------
    missing: List of strings
        List of the bibkeys not found in the bibmanager database.

    Raises
    ------
    ValueError
        If texfile does not have a '.tex' extension.
    """
    # Split the absolute path into folder, root name, and extension:
    folder, basename = os.path.split(os.path.realpath(texfile))
    root, extension = os.path.splitext(basename)
    if extension != ".tex":
        raise ValueError("Input file does not have a .tex extension.")

    main_tex = f'{folder}/{root}.tex'
    with open(main_tex, "r", encoding='utf-8') as f:
        tex = f.read()

    # Skip the preamble when there is one:
    doc_start = tex.find(r'\begin{document}')
    if doc_start > 0:
        tex = tex[doc_start:]

    # Strip comments and append the content of included files:
    tex = parse_subtex_files(tex)

    # Default output goes next to the tex file:
    if bibfile is None:
        bibfile = f'{folder}/{get_bibfile(main_tex)}'

    # Sorted, unique set of keys cited in the text:
    tex_keys = np.unique(list(citations(tex)))

    # Match them against the keys in the database:
    database = bm.load()
    db_keys = [entry.key for entry in database]
    found = np.isin(tex_keys, db_keys, assume_unique=True)
    missing = tex_keys[~found]
    if not np.all(found):
        print("References not found:\n{:s}".format('\n'.join(missing)))

    # Export only the entries that are cited:
    cited = [database[db_keys.index(key)] for key in tex_keys[found]]
    bm.export(cited, bibfile=bibfile)

    return missing




[docs]
def update_keys(texfile, key_replacements, is_main):
    r"""
    Update citation keys in a tex file according to key_replacements.
    Work our way recursively into sub-files.

    Parameters
    ----------
    texfile: String
        Path to an existing .tex file.
    key_replacements: Dict
        Mapping from the citation keys to replace to their new values.
    is_main: Bool
        If True, ignore everything up to the '\begin{document}' call
        (when there is one).

    Notes
    -----
    The input file is overwritten.  Before that, a copy of the
    original is saved in the same folder as 'orig_{date}_{filename}'.
    The same is done for every file referenced by an \input, \include,
    or \subfile call.
    """
    with open(texfile, 'r', encoding='utf-8') as f:
        tex = f.read()

    beginning = tex.find(r'\begin{document}') if is_main else 0
    # str.find() returns -1 when there is no match.  Without this guard
    # only the last character of the file would be searched for cites:
    if beginning < 0:
        beginning = 0

    # Temporarily replace comments, keep a record of them:
    replacer = Replacer(key_replacements)
    text = re.sub(r"\A%.*|[^\\]%.*", replacer.mask_comments, tex[beginning:])

    # See citations() for an explanation of this pattern:
    p = re.compile(
        r"\\(?:defcitealias|nocite|cite|"
        r"(?:[Cc]ite(?:p|alp|t|alt|author|year|yearpar)\*?))"
        r"[\s]*(\[[^\]]*\])?"
        r"[\s]*(\[[^\]]*\])?"
        r"[\s]*{([^}]+)"
    )
    # Reconstruct text, replacing citations as needed.  The skipped
    # preamble (if any) is copied verbatim:
    pieces = [tex[0:beginning]]
    start = 0
    for match in p.finditer(text):
        # Text up to the keys (the last group of the pattern):
        pos, _ = match.span(3)
        pieces.append(text[start:pos])
        pieces.append(replacer.replace(match.group(3)))
        start = match.end()
    pieces.append(text[start:])

    # Put comments back in:
    new_text = replacer.recover_comments("".join(pieces))

    # Back up the original before overwriting it:
    path, tfile = os.path.split(os.path.realpath(texfile))
    today = str(datetime.date.today())
    shutil.copy(
        texfile,
        f"{path}/orig_{today}_{tfile}",
    )
    with open(texfile, 'w', encoding='utf-8') as f:
        f.write(new_text)

    # Recursive calls into referenced .tex files:
    p = re.compile(r"\\(?:input|include|subfile)[\s]*{([^}]+)")
    for input_file in p.findall(tex):
        input_file = os.path.realpath(input_file)
        input_file, extension = os.path.splitext(input_file.strip())
        update_keys(f'{input_file}.tex', key_replacements, is_main=False)




[docs]
def clear_latex(texfile):
    """
    Delete the by-products of previous latex compilations.

    Parameters
    ----------
    texfile: String
        Path to an existing .tex file.

    Notes
    -----
    For an input argument texfile='filename.tex', this function deletes
    the files that begin with 'filename' followed by:
      .bbl, .blg, .out, .dvi,
      .log, .aux, .lof, .lot,
      .toc, .ps,  .pdf, Notes.bib
    """
    # Root name, without the extension:
    root, _ = os.path.splitext(texfile)

    suffixes = (
        '.bbl', '.blg', '.out', '.dvi',
        '.log', '.aux', '.lof', '.lot',
        '.toc', '.ps',  '.pdf', 'Notes.bib',
    )
    for suffix in suffixes:
        # OSError raised by os.remove() is suppressed:
        with u.ignored(OSError):
            os.remove(root + suffix)




[docs]
def compile_latex(texfile, paper=None):
    """
    Compile a .tex file into a .pdf file using latex calls.

    Parameters
    ----------
    texfile: String
        Path to an existing .tex file.
    paper: String
        Paper size for output.  For example, ApJ articles use letter
        format, whereas A&A articles use A4 format.
        If None, take the value from the 'paper' configuration entry.

    Raises
    ------
    ValueError
        If texfile has an extension other than '.tex', or if the
        .tex file does not exist.

    Notes
    -----
    This function executes the following calls:
    - compute a bibfile out of the citation calls in the .tex file.
    - removes all outputs from previous compilations (see clear_latex())
    - calls latex, bibtex, latex, latex to produce a .dvi file
    - calls dvips to produce a .ps file, redirecting the output to
      ps2pdf to produce the final .pdf file.
    """
    # Local import, only needed to quote the shell pipeline below:
    import shlex

    # Extract path:
    path, texfile = os.path.split(os.path.realpath(texfile))
    # Remove extension:
    texfile, extension = os.path.splitext(texfile)

    if extension not in [".tex", ""]:
        raise ValueError("Input file does not have a .tex extension")

    if not os.path.isfile(f"{path}/{texfile}.tex"):
        raise ValueError("Input .tex file does not exist")

    # Default paper format:
    if paper is None:
        paper = cm.get('paper')

    # Proceed in place:
    with u.cd(path):
        # Re-generate bib file if necessary.
        missing = build_bib(f'{texfile}.tex')

        # Clean up:
        clear_latex(texfile)

        # Compile into dvi:
        subprocess.call(['latex',  texfile], shell=False)
        subprocess.call(['bibtex', texfile], shell=False)
        subprocess.call(['latex',  texfile], shell=False)
        subprocess.call(['latex',  texfile], shell=False)

        # dvi to pdf:
        # The dvips output is piped straight into ps2pdf through the
        # shell.  Every interpolated value is quoted so that blanks or
        # shell metacharacters in the names cannot alter the command:
        quoted_paper = shlex.quote(str(paper))
        quoted_tex = shlex.quote(texfile)
        quoted_pdf = shlex.quote(f'{texfile}.pdf')
        subprocess.call(
            f'dvips -R0 -P pdf -t {quoted_paper} -f {quoted_tex} | '
            'ps2pdf -dCompatibilityLevel=1.3 -dEmbedAllFonts=true '
            f'-dMaxSubsetPct=100 -dSubsetFonts=true - - > {quoted_pdf}',
            shell=True)
        # Some notes:
        # (1) '-P pdf' makes the file to look good on screen, says STScI:
        #     http://www.stsci.edu/hst/proposing/info/how-to-make-pdf
        # (2) See 'man ps2pdf' to understand the dashes.
        # (3) See https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDFCreationSettings_v9.pdf for ps2pdf options.

    if len(missing) > 0:
        print(f"\n{texfile}.tex has some references not found:")
        for key in missing:
            print("- " + key)




[docs]
def compile_pdflatex(texfile):
    """
    Compile a .tex file into a .pdf file using pdflatex calls.

    Parameters
    ----------
    texfile: String
        Path to an existing .tex file.

    Raises
    ------
    ValueError
        If texfile has an extension other than '.tex', or if the
        .tex file does not exist.

    Notes
    -----
    This function executes the following calls:
    - compute a bibfile out of the citation calls in the .tex file.
    - removes all outputs from previous compilations (see clear_latex())
    - calls pdflatex, bibtex, pdflatex, pdflatex to produce a .pdf file
    """
    # Folder and root name (no extension) of the input file:
    folder, filename = os.path.split(os.path.realpath(texfile))
    root, extension = os.path.splitext(filename)

    # The extension may be omitted, but it cannot be anything else:
    if extension != ".tex" and extension != "":
        raise ValueError("Input file does not have a .tex extension")

    if not os.path.isfile(f"{folder}/{root}.tex"):
        raise ValueError("Input .tex file does not exist")

    # Work from within the folder of the tex file:
    with u.cd(folder):
        # Re-generate bib file if necessary.
        missing = build_bib(f'{root}.tex')

        # Remove the outputs of previous runs:
        clear_latex(root)

        # Compile into pdf:
        for program in ('pdflatex', 'bibtex', 'pdflatex', 'pdflatex'):
            subprocess.call([program, root], shell=False)

    if len(missing) > 0:
        print(f"\n{root}.tex has some references not found:")
        for key in missing:
            print("- " + key)