# Copyright (c) 2018-2024 Patricio Cubillos.
# bibmanager is open-source software under the MIT license (see LICENSE).
__all__ = [
'guess_name',
'open',
'set_pdf',
'request_ads',
'fetch',
]
import re
import os
import sys
import shutil
import urllib
import subprocess
import webbrowser
# Be explicit about builtin to avoid conflict with pm.open
from io import open as builtin_open
import requests
from .. import bib_manager as bm
from .. import utils as u
[docs]
def guess_name(bib, arxiv=False):
r"""
Guess a PDF filename for a BibTex entry. Include at least author
and year. If entry has a bibtex, include journal info.
Parameters
----------
bib: A Bib() instance
BibTex entry to generate a PDF filename for.
arxiv: Bool
True if this PDF comes from ArXiv. If so, prepend 'arxiv_' into
the output name.
Returns
-------
guess_filename: String
Suggested name for a PDF file of the entry.
Examples
--------
>>> import bibmanager.bib_manager as bm
>>> import bibmanager.pdf_manager as pm
>>> bibs = bm.load()
>>> # Entry without bibcode:
>>> bib = bm.Bib('''@misc{AASteam2016aastex61,
>>> author = {{AAS Journals Team} and {Hendrickson}, A.},
>>> title = {AASJournals/AASTeX60: Version 6.1},
>>> year = 2016,
>>> }''')
>>> print(pm.guess_name(bib))
AASJournalsTeam2016.pdf
>>> # Entry with bibcode:
>>> bib = bm.Bib('''@ARTICLE{HuangEtal2014jqsrtCO2,
>>> author = {{Huang (黄新川)}, Xinchuan and {Gamache}, Robert R.},
>>> title = "{Reliable infrared line lists for 13 CO$_{2}$}",
>>> year = "2014",
>>> adsurl = {https://ui.adsabs.harvard.edu/abs/2014JQSRT.147..134H},
>>> }''')
>>> print(pm.guess_name(bib))
>>> Huang2014_JQSRT_147_134.pdf
>>> # Say, we are querying from ArXiv:
>>> print(pm.guess_name(bib, arxiv=True))
Huang2014_arxiv_JQSRT_147_134.pdf
"""
# Remove non-ascii and non-letter characters:
author = bib.get_authors(format='ushort')
author = author.encode('ascii', errors='ignore').decode()
author = re.sub('\W', '', author)
year = '' if bib.year is None else bib.year
guess_filename = f"{author}{year}.pdf"
if author == '' and year == '':
raise ValueError(
'Could not guess a good filename since entry does not '
'have author nor year fields')
if bib.bibcode is not None:
journal = re.sub('(\.|&)', '', bib.bibcode[4:9])
if arxiv and journal.lower() != 'arxiv':
journal = f'arxiv_{journal}'
vol = bib.bibcode[ 9:13].replace('.', '')
if vol != '':
vol = f'_{vol}'
page = bib.bibcode[14:18].replace('.', '')
if page != '':
page = f'_{page}'
guess_filename = f'{author}{year}_{journal}{vol}{page}.pdf'
return guess_filename
[docs]
def open(pdf=None, key=None, bibcode=None, pdf_file=None):
"""
Open the PDF file associated to the entry matching the input key
or bibcode argument.
Parameters
----------
pdf: String
PDF file to open. This refers to a filename located in
home/pdf/. Thus, it should not contain the file path.
key: String
Key of Bibtex entry to open it's PDF (ignored if pdf is not None).
bibcode: String
Bibcode of Bibtex entry to open it's PDF (ignored if pdf or key
is not None).
pdf_file: String
Absolute path to PDF file to open. If not None, this argument
takes precedence over pdf, key, and bibcode.
"""
if pdf is None and key is None and bibcode is None and pdf_file is None:
raise ValueError("At least one of the arguments must be not None")
# Set pdf_file:
if pdf_file is not None:
pass
elif pdf is not None:
pdf_file = u.BM_PDF() + pdf
else:
bib = bm.find(key=key, bibcode=bibcode)
if bib is None:
raise ValueError('Requested entry does not exist in database')
if bib.pdf is None:
raise ValueError('Entry does not have a PDF in the database')
pdf_file = u.BM_PDF() + bib.pdf
if not os.path.isfile(pdf_file):
path, pdf = os.path.split(pdf_file)
raise ValueError(f"Requested PDF file '{pdf}' does not exist in "
f"database PDF dir '{path}'")
# Always use default PDF viewers:
if sys.platform == "win32":
os.startfile(pdf_file)
else:
opener = "open" if sys.platform == "darwin" else "xdg-open"
subprocess.run([opener, pdf_file], stdout=subprocess.DEVNULL)
[docs]
def set_pdf(
bib, pdf=None, bin_pdf=None, filename=None,
arxiv=False, replace=False,
):
"""
Update the PDF file of the given BibTex entry in database
If pdf is not None, move the file into the database pdf folder.
Parameters
----------
bibcode: String or Bib() instance
Entry to be updated (must exist in the Bibmanager database).
If string, the ADS bibcode of key ID of the entry.
pdf: String
Path to an existing PDF file.
Only one of pdf and bin_pdf must be not None.
bin_pdf: String
PDF content in binary format (e.g., as in req.content).
Only one of pdf and bin_pdf must be not None.
arxiv: Bool
Flag indicating the source of the PDF. If True, insert
'arxiv' into a guessed name.
filename: String
Filename to assign to the PDF file. If None, take name from
pdf input argument, or else from guess_name().
replace: Bool
Replace without asking if the entry already has a PDF assigned;
else, ask the user.
Returns
-------
filename: String
If bib.pdf is not None at the end of this operation,
return the absolute path to the bib.pdf file (even if this points
to a pre-existing file).
Else, return None.
"""
if isinstance(bib, str):
e = bm.find(key=bib)
bib = bm.find(bibcode=bib) if e is None else e
if bib is None:
raise ValueError('BibTex entry is not in Bibmanager database')
if (pdf is None) + (bin_pdf is None) != 1:
raise ValueError('Exactly one of pdf or bin_pdf must be not None')
# Let's have a guess, if needed:
guess_filename = guess_name(bib, arxiv=arxiv)
if filename is None:
filename = os.path.basename(pdf) if pdf is not None else guess_filename
if not filename.lower().endswith('.pdf'):
raise ValueError('Invalid filename, must have a .pdf extension')
if os.path.dirname(filename) != '':
raise ValueError('filename must not have a path')
if pdf is not None and bib.pdf is not None:
pdf_is_not_bib_pdf = os.path.abspath(pdf) != f'{u.BM_PDF()}{bib.pdf}'
else:
pdf_is_not_bib_pdf = True
# PDF files in BM_PDF (except for the entry being fetched):
pdf_names = [
file
for file in os.listdir(u.BM_PDF())
if os.path.splitext(file)[1].lower() == '.pdf']
with u.ignored(ValueError):
pdf_names.remove(bib.pdf)
if pdf == f'{u.BM_PDF()}{filename}':
pdf_names.remove(filename)
if not replace and bib.pdf is not None and pdf_is_not_bib_pdf:
rep = u.req_input(f"Bibtex entry already has a PDF file: '{bib.pdf}' "
"Replace?\n[]yes, [n]o.\n", options=['', 'y', 'yes', 'n', 'no'])
if rep in ['n', 'no']:
return f"{u.BM_PDF()}{bib.pdf}"
while filename in pdf_names:
overwrite = input(
f"A filename '{filename}' already exists. Overwrite?\n"
f"[]yes, [n]o, or type new file name (e.g., {guess_filename}).\n")
if overwrite in ['n', 'no']:
return
elif overwrite in ['', 'y', 'yes']:
break
elif overwrite.lower().endswith('.pdf'):
filename = overwrite
# Delete pre-existing file only if not merely renaming:
if pdf is None or pdf_is_not_bib_pdf:
with u.ignored(OSError):
os.remove(f"{u.BM_PDF()}{bib.pdf}")
if pdf is not None:
shutil.move(pdf, f"{u.BM_PDF()}{filename}")
else:
with builtin_open(f"{u.BM_PDF()}{filename}", 'wb') as f:
f.write(bin_pdf)
print(f"Saved PDF to: '{u.BM_PDF()}{filename}'.")
# Update entry and database:
bibs = bm.load()
index = bibs.index(bib)
bib.pdf = filename
bibs[index] = bib
bm.save(bibs)
bm.export(bibs, meta=True)
return f"{u.BM_PDF()}{filename}"
[docs]
def request_ads(bibcode, source='journal'):
"""
Request a PDF from ADS.
Parameters
----------
bibcode: String
ADS bibcode of entry to request PDF.
source: String
Flag to indicate from which source make the request.
Choose between: 'journal', 'ads', or 'arxiv'.
Returns
-------
req: requests.Response instance
The server's response to the HTTP request.
Return None if it failed to establish a connection.
Note
----
If the request succeeded, but the response content is not a PDF,
this function modifies the value of req.status_code (in a desperate
attempt to give a meaningful answer).
Examples
--------
>>> import bibmanager.pdf_manager as pm
>>> bibcode = '2017AJ....153....3C'
>>> req = pm.request_ads(bibcode)
>>> # On successful request, you can save the PDF file as, e.g.:
>>> with open('fetched_file.pdf', 'wb') as f:
>>> f.write(r.content)
>>> # Nature articles are not directly accessible from Journal:
>>> bibcode = '2018NatAs...2..220D'
>>> req = pm.request_ads(bibcode)
Request failed with status code 404: NOT FOUND
>>> # Get ArXiv instead:
>>> req = pm.request_ads(bibcode, source='arxiv')
"""
sources = {
'journal': 'PUB_PDF',
'arxiv': 'EPRINT_PDF',
'ads': 'ADS_PDF',
}
if source not in sources:
raise ValueError(f"Source argument must be one of {list(sources)}")
query = ('https://ui.adsabs.harvard.edu/link_gateway/'
f'{urllib.parse.quote(bibcode)}/{sources[source]}')
# This fixed MNRAS requests and CAPTCHA issues:
# (take from https://stackoverflow.com/questions/43165341)
headers = requests.utils.default_headers()
headers['User-Agent'] = (
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
# Make the request:
try:
req = requests.get(query, headers=headers)
except requests.exceptions.ConnectionError:
print('Failed to establish a web connection.')
return None
if not req.ok:
print(f'Request failed with status code {req.status_code}: '
f'{req.reason}')
elif req.headers['Content-Type'].startswith('text/html') \
and 'CAPTCHA' in req.content.decode():
browse = u.req_input('There are issues with CAPTCHA verification, '
'try to open PDF in browser?\n[]yes [n]o.\n',
options=['', 'y', 'yes', 'n', 'no'])
if browse in ['', 'y', 'yes']:
webbrowser.open(query, new=2)
print("\nIf you managed to download the PDF, add the PDF into "
"the database\nwith the following command (and right path):\n"
f"bibm pdf '{bibcode}' PATH/TO/FILE.pdf guess")
req.status_code = -101
else:
req.status_code = -102
# Request is OK, but output is not a PDF:
elif not req.headers['Content-Type'].startswith('application/pdf'):
print('Request succeeded, but fetched content is not a PDF (might '
'have been\nredirected to website due to paywall).')
req.status_code = -100
return req
[docs]
def fetch(bibcode, filename=None, replace=None):
"""
Attempt to fetch a PDF file from ADS. If successful, then
add it into the database. If the fetch succeeds but the bibcode is
not in the database, download file to current folder.
Parameters
----------
bibcode: String
ADS bibcode of entry to update.
filename: String
Filename to assign to the PDF file. If None, get from
guess_name() function.
Replace: Bool
If True, enforce replacing a PDF regardless of a pre-existing one.
If None (default), only ask when fetched PDF comes from arxiv.
Returns
-------
filename: String
If successful, return the full path of the file name.
If not, return None.
"""
arxiv = False
print('Fetching PDF file from Journal website:')
req = request_ads(bibcode, source='journal')
if req is None:
return
if req.status_code != 200:
print('Fetching PDF file from ADS website:')
req = request_ads(bibcode, source='ads')
if req is None:
return
if req.status_code != 200:
print('Fetching PDF file from ArXiv website:')
req = request_ads(bibcode, source='arxiv')
arxiv = True
if replace is None:
replace = False
if req is None:
return
if replace is None:
replace = True
if req.status_code == 200:
if bm.find(bibcode=bibcode) is None:
if filename is None:
filename = f'{bibcode}.pdf'
with builtin_open(filename, 'wb') as f:
f.write(req.content)
print(f"Saved PDF to: '{filename}'.\n"
"(Note that BibTex entry is not in the Bibmanager database)")
else:
filename = set_pdf(
bibcode, bin_pdf=req.content, filename=filename, arxiv=arxiv,
replace=replace)
return filename
print('Could not fetch PDF from any source.')