G

pubmed to bibtex converter

unlisted
Guest May 11, 2025 23 days 11
Clone
Python paste1.py 141 lines (121 loc) | 5.24 KB
1
#V2.0
2
#by m36-intj: makes BibTeX entries from a list of PubMed article PMIDs; links to the PMC article when one is available
3
#fixed the occasionally wrong publication date: the date is now fetched with Biopython instead of pymed
4
#same for the title: titles are now fetched with Biopython
5
#added unicode to latex translation of author names
6
#now uses special regex rules to convert titles to LaTeX format
7
8
#input list file format is
9
#%comment
10
#12345678
11
#23415251
12
#....
13
14
from pymed import PubMed
15
pubmed = PubMed()
16
from Bio import Entrez
17
from Bio import Medline
18
from pylatexenc.latexencode import unicode_to_latex
19
from pylatexenc.latexencode import UnicodeToLatexEncoder, UnicodeToLatexConversionRule, RULE_REGEX
20
import re
21
22
# Contact address sent along with every Entrez request.
# NOTE(review): the value below looks like a scrubbed placeholder rather than a
# real address -- confirm and set a valid one before running.
Entrez.email = "[email protected]" # Required by NCBI
23
24
def test(pmids_path='/storage/emulated/0/Download/python/pmids',
         output_path='/storage/emulated/0/Download/python/bibtex.txt'):
    """Convert a list of PubMed IDs into BibTeX entries.

    Reads one PMID per line from ``pmids_path`` (blank lines and lines starting
    with ``%`` are ignored), looks each article up, prints what was found and
    appends one BibTeX entry per article to ``output_path``.

    Args:
        pmids_path: Text file holding the PMIDs, one per line.
        output_path: File the BibTeX entries are written to (overwritten).
    """
    # Open the input first so a missing PMID list does not truncate an
    # existing output file; ``with`` guarantees both files get closed.
    with open(pmids_path, "r", encoding="utf-8") as pmids, \
            open(output_path, "w", encoding="utf-8") as output_file:
        for line in pmids:
            pmid = line.strip()
            # Skip blank lines and comments (also when preceded by whitespace).
            if not pmid or pmid.startswith("%"):
                continue

            results = pubmed.query(f"{pmid}[PMID]", max_results=1)
            article = next(results, None)
            if not article:
                print(f"\nPMID {pmid} not found.")
                continue

            print(f"PMID: {pmid}")
            title = fetch_title(pmid)
            if title is None:
                # fetch_title signals failure with None; there is nothing to encode.
                print(f"No title available for PMID {pmid}, skipping.\n")
                continue
            title = special_convert(title)
            print(f"Title: {title}")

            # The id field may hold several whitespace-separated ids; keep the first.
            id_tokens = str(article.pubmed_id).split()
            clean_pmid = id_tokens[0] if id_tokens else pmid

            # Some records carry no authors, or authors with missing name parts.
            authors = article.authors or []
            first_author = authors[0] if authors else {}
            name_parts = (first_author.get('firstname'), first_author.get('lastname'))
            first_author_name = " ".join(part for part in name_parts if part)
            if not first_author_name:
                first_author_name = "Unknown"
            # translate to latex syntax
            first_author_name = unicode_to_latex(first_author_name)
            print(f"First Author: {first_author_name}")

            url = get_pmc(clean_pmid)
            print(f"URL: {url}")

            # Extract year
            handle = Entrez.esummary(db="pubmed", id=clean_pmid)
            try:
                record = Entrez.read(handle)
            finally:
                handle.close()
            year = record[0]["PubDate"].split()[0]
            print(f"Publication year: {year}")

            # get ISO abbreviation for journal name
            abbrev = get_journal_abbreviation(clean_pmid)
            print(f"ISO Abbreviation: {abbrev}")  # e.g., "Nat. Biotechnol."

            # make a bibtex entry
            bibtex_entry(output_file, first_author_name, url, title, abbrev, year)
            print("")
65
66
# Example:
67
def get_pmc(pmid):
    """Return the best URL for an article: its PMC page, else its PubMed page.

    Asks Entrez for PubMed -> PMC links and uses the first id of the
    ``pubmed_pmc`` link set (the article's own PMC record). Other link sets
    returned by the same query, such as the PMC articles that cite this one,
    are ignored so they are never mistaken for the article itself.

    Args:
        pmid: PubMed identifier of the article.

    Returns:
        The PMC article URL when a PMC record is linked, otherwise the PubMed
        URL. Any lookup error is printed and also yields the PubMed URL.
    """
    try:
        handle = Entrez.elink(dbfrom="pubmed", db="pmc", id=pmid)
        try:
            record = Entrez.read(handle)
        finally:
            # Always release the network handle, even if parsing fails.
            handle.close()
        linksets = record[0].get("LinkSetDb", []) if record else []
        for linkset in linksets:
            if linkset.get("LinkName") == "pubmed_pmc" and linkset.get("Link"):
                pmcid = linkset["Link"][0]["Id"]
                return f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmcid}/"
    except Exception as e:
        # Best effort: a failed lookup must not abort the whole conversion.
        print(f"Error fetching PMCID for PMID {pmid}: {str(e)}")
    return f"https://pubmed.ncbi.nlm.nih.gov/{pmid}"
77
78
79
def get_journal_abbreviation(pmid):
    """Return the journal name for a PMID, preferring the abbreviated form.

    Fetches the record in Medline text format and returns the first non-empty
    value among the fields ``TA``, ``JT`` and ``SO``, tried in that order.

    Args:
        pmid: PubMed identifier of the article.

    Returns:
        The journal string, or ``"Journal name unavailable"`` when none of the
        fields is filled or the lookup fails (the error is printed).
    """
    try:
        # Medline format is used here rather than XML.
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
        try:
            record = Medline.read(handle)
        finally:
            # Always release the network handle, even if parsing fails.
            handle.close()

        for field in ('TA', 'JT', 'SO'):
            value = record.get(field)
            if value:
                return value
        return 'Journal name unavailable'

    except Exception as e:
        # Best effort: a failed lookup must not abort the whole conversion.
        print(f"Error processing PMID {pmid}: {str(e)}")
        return "Journal name unavailable"
98
99
def fetch_title(pmid):
    """Fetch article title for a given PMID.

    Args:
        pmid: PubMed identifier of the article.

    Returns:
        The ``ArticleTitle`` of the first ``PubmedArticle`` in the fetched
        record, or ``None`` when the record holds no such article or the
        lookup fails (the error is printed).
    """
    try:
        # Fetch XML data for the PMID
        handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
        try:
            record = Entrez.read(handle)
        finally:
            # Always release the network handle, even if parsing fails.
            handle.close()

        # Extract title from the nested XML structure
        if "PubmedArticle" in record and record["PubmedArticle"]:
            article = record["PubmedArticle"][0]
            return article["MedlineCitation"]["Article"]["ArticleTitle"]
    except Exception as e:
        print(f"Error fetching title for PMID {pmid}: {e}")
    return None
114
115
#converts a string to LaTeX format; special rules may be needed for embedded markup, e.g. <i>text</i> -> \textit{text}
116
def special_convert(string):
    """Encode ``string`` as LaTeX, turning ``<i>``/``</i>`` tags into a textit group.

    The tag rules are applied ahead of the encoder's default rules.
    """
    italic_tag_rule = UnicodeToLatexConversionRule(
        rule_type=RULE_REGEX,
        rule=[
            (re.compile(r'<i>'), r'\\textit{'),
            (re.compile(r'</i>'), r'}'),
        ],
    )
    encoder = UnicodeToLatexEncoder(
        conversion_rules=[italic_tag_rule, 'defaults'],
    )
    latex_text = encoder.unicode_to_latex(string)
    return latex_text
126
127
#@Article{hallmarks_aging,
128
# Author = "Carlos Lopez-Otin and others",
129
# Title = "\href{https://pubmed.ncbi.nlm.nih.gov/36599349/}{Hallmarks of aging: An expanding universe}",
130
# Journal = "Cell Metabolism",
131
# Year = 2023,
132
#}
133
def _citation_key(url):
    """Derive a citation key from the last path segment of the article URL."""
    segment = str(url).rstrip("/").rsplit("/", 1)[-1]
    ident = re.sub(r"[^A-Za-z0-9]", "", segment)
    if not ident:
        return "key"
    # A purely numeric segment gets a "pmid" prefix.
    return f"pmid{ident}" if ident.isdigit() else ident


def bibtex_entry(text_file, author, url, title, journal, year, key=None):
    """Write one ``@Article`` BibTeX entry to ``text_file``.

    Args:
        text_file: Writable text stream receiving the entry.
        author: First author; " and others" is appended.
        url: Article link; the title is wrapped in an href to it.
        title: Article title, already LaTeX-encoded.
        journal: Journal name; a trailing period is added if missing.
        year: Publication year, written unquoted.
        key: Citation key. When omitted, one is derived from the last path
            segment of ``url`` so that entries for different URLs no longer
            share the same placeholder key.
    """
    if key is None:
        key = _citation_key(url)
    journal_text = str(journal)
    if not journal_text.endswith("."):
        journal_text = f"{journal_text}."
    print(f"@Article {{{key},", file=text_file)
    print(f"Author = \"{author} and others\",", file=text_file)
    print(f"Title = \"\\href{{{url}}}{{{title}}}\",", file=text_file)
    print(f"Journal = \"{journal_text}\",", file=text_file)
    print(f"Year = {year},\n}}\n", file=text_file)
140
141
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    test()