pubmed to bibtex converter

unlisted

Guest May 11, 2025 23 days 11

Python paste1.py 141 lines (121 loc) | 5.24 KB

Raw

1 #V2.0
2 #by m36-intj: makes BibTeX entries from a list of PubMed article PMIDs; uses the PMC article link if available
3 #fixed the occasionally wrong publication date: now uses Biopython instead of pymed to get the publication date
4 #same for the title: now uses Biopython for titles
5 #added Unicode-to-LaTeX translation of author names
6 #now uses special regex rules for title conversion to LaTeX format
7  
8 #input list file format is
9 #%comment
10 #12345678
11 #23415251
12 #....
13  
import re

from Bio import Entrez
from Bio import Medline
from pylatexenc.latexencode import unicode_to_latex
from pylatexenc.latexencode import UnicodeToLatexEncoder, UnicodeToLatexConversionRule, RULE_REGEX
from pymed import PubMed

# Shared pymed client; test() uses it to look up each PMID.
pubmed = PubMed()

# NOTE(review): this literal looks like a web-page e-mail redaction placeholder
# rather than a real address -- replace it with your own before running.
Entrez.email = "[email protected]"  # Required by NCBI
23  
def test():
    """Write one BibTeX entry per PMID listed in the input file.

    Reads the hard-coded ``pmids`` file line by line (lines starting with
    ``%`` are comments), looks each PMID up through pymed and Entrez, prints
    the collected fields, and writes an ``@Article`` entry to ``bibtex.txt``
    via bibtex_entry(). PMIDs that pymed cannot find are reported and skipped.
    """
    pmids_path = '/storage/emulated/0/Download/python/pmids'
    output_path = '/storage/emulated/0/Download/python/bibtex.txt'

    # The input is opened first so a missing PMID list does not truncate an
    # existing output file; ``with`` closes (and flushes) both files even if
    # a lookup raises part-way through.
    with open(pmids_path, "r", encoding="utf-8") as pmids, \
            open(output_path, "w", encoding="utf-8") as output_file:
        for line in pmids:
            pmid = line.strip()
            # Skip blank lines (they would query "[PMID]") and %-comments.
            if not pmid or pmid.startswith("%"):
                continue

            results = pubmed.query(f"{pmid}[PMID]", max_results=1)
            article = next(results, None)
            if not article:
                print(f"\nPMID {pmid} not found.")
                continue

            print(f"PMID: {pmid}")
            title = fetch_title(pmid)
            if title is None:
                # fetch_title() returns None on failure; fall back to the
                # title pymed delivered instead of crashing the conversion.
                title = getattr(article, "title", None) or ""
            title = special_convert(title)
            print(f"Title: {title}")
            raw_id = str(article.pubmed_id)
            clean_pmid = raw_id.split()[0]  # Takes first entry if multiple exist

            # The author list may be empty and individual name parts may be
            # None; treat both as empty strings rather than printing "None".
            authors = article.authors or []
            first_author = authors[0] if authors else {}
            first_name = first_author.get('firstname') or ''
            last_name = first_author.get('lastname') or ''
            first_author_name = f"{first_name} {last_name}".strip()
            # translate to latex syntax
            first_author_name = unicode_to_latex(first_author_name)
            print(f"First Author: {first_author_name}")

            url = get_pmc(clean_pmid)
            print(f"URL: {url}")

            # Extract year: first whitespace-separated token of PubDate.
            handle = Entrez.esummary(db="pubmed", id=clean_pmid)
            try:
                record = Entrez.read(handle)
            finally:
                handle.close()
            year = record[0]["PubDate"].split()[0]
            print(f"Publication year: {year}")

            # get abbreviation for journal name
            abbrev = get_journal_abbreviation(clean_pmid)
            print(f"ISO Abbreviation: {abbrev}")  # e.g., "Nat. Biotechnol."
            # make a bibtex entry
            bibtex_entry(output_file, first_author_name, url, title, abbrev, year)
            print("")
65     
66 # Example:
def get_pmc(pmid):
    """Return the preferred article URL for *pmid*.

    Queries Entrez ELink (pubmed -> pmc). If the article itself has a PubMed
    Central record, the PMC article URL is returned; otherwise, or when the
    lookup raises, the plain PubMed URL is returned. Lookup errors are printed,
    not propagated.
    """
    try:
        handle = Entrez.elink(dbfrom="pubmed", db="pmc", id=pmid)
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()

        if record and "LinkSetDb" in record[0]:
            for link_set in record[0]["LinkSetDb"]:
                # ELink can return several link sets for this database pair
                # (e.g. PMC articles that merely cite this one). Only
                # "pubmed_pmc" identifies the article's own PMC record, so
                # blindly taking the first set could yield an unrelated URL.
                if link_set.get("LinkName") != "pubmed_pmc":
                    continue
                links = link_set.get("Link")
                if links:
                    pmcid = links[0]["Id"]
                    return f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmcid}/"
    except Exception as e:
        print(f"Error fetching PMCID for PMID {pmid}: {str(e)}")
    return f"https://pubmed.ncbi.nlm.nih.gov/{pmid}"
77     
78  
def get_journal_abbreviation(pmid):
    """Return a journal name for *pmid* taken from its Medline record.

    Fetches the record in Medline text format and returns the first of the
    fields ``TA``, ``JT``, ``SO`` that is present. Returns the string
    "Journal name unavailable" when none is present or when the lookup
    raises (the error is printed, not propagated).
    """
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
        try:
            record = Medline.read(handle)
        finally:
            # Close the connection even if parsing fails.
            handle.close()

        # Fields are tried in order of preference; the first one found wins.
        for field in ('TA', 'JT', 'SO'):
            if field in record:
                return record[field]

        return 'Journal name unavailable'

    except Exception as e:
        print(f"Error processing PMID {pmid}: {str(e)}")
        return "Journal name unavailable"
98  
def fetch_title(pmid):
    """Fetch the article title for a given PMID.

    Returns the ``ArticleTitle`` of the first ``PubmedArticle`` in the
    Entrez XML response, or ``None`` when the response contains no article
    or the lookup raises (the error is printed, not propagated).
    """
    try:
        # Fetch XML data for the PMID
        handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
        try:
            record = Entrez.read(handle)
        finally:
            # Close the connection even if parsing fails.
            handle.close()

        # An empty article list is a plain "not found", not an error.
        if "PubmedArticle" in record and record["PubmedArticle"]:
            article = record["PubmedArticle"][0]
            return article["MedlineCitation"]["Article"]["ArticleTitle"]
    except Exception as e:
        print(f"Error fetching title for PMID {pmid}: {e}")
    return None
114  
115  #this converts strings to LaTex format, there might be special rules which are needed e.g. <i>text</i> -> \textit{text}
def special_convert(string):
    """Convert *string* (e.g. an article title) to LaTeX markup.

    Inline HTML tags are translated to LaTeX commands first
    (``<i>`` -> ``\\textit{``, ``<b>`` -> ``\\textbf{``,
    ``<sub>`` -> ``\\textsubscript{``, ``<sup>`` -> ``\\textsuperscript{``,
    each closing tag -> ``}``); everything else goes through pylatexenc's
    default Unicode-to-LaTeX rules.

    Returns an empty string for ``None`` or empty input so a failed title
    lookup does not crash the conversion.
    """
    if not string:
        return ""

    # Replacement strings are written with a doubled backslash, exactly like
    # the original <i> rule, so that a single backslash is emitted.
    tag_rules = [
        (re.compile(r'<i>'), r'\\textit{'),
        (re.compile(r'<b>'), r'\\textbf{'),
        (re.compile(r'<sub>'), r'\\textsubscript{'),
        (re.compile(r'<sup>'), r'\\textsuperscript{'),
        (re.compile(r'</(?:i|b|sub|sup)>'), r'}'),
    ]
    encoder = UnicodeToLatexEncoder(
        conversion_rules=[
            # Tag rules come first so the defaults never see the angle brackets.
            UnicodeToLatexConversionRule(rule_type=RULE_REGEX, rule=tag_rules),
            'defaults',
        ]
    )
    return encoder.unicode_to_latex(string)
126 	
127 #@Article{hallmarks_aging,
128 # Author         = "Carlos Lopez-Otin and others",
129 # Title          = "\href{https://pubmed.ncbi.nlm.nih.gov/36599349/}{Hallmarks of aging: An expanding universe}",
130 # Journal        = "Cell Metabolism",
131 # Year           = 2023,
132 #}
def bibtex_entry(text_file, author, url, title, journal, year, key="key"):
    """Write one ``@Article`` BibTeX entry to *text_file*.

    Args:
        text_file: writable text stream that receives the entry.
        author: first author's name; " and others" is appended.
        url: article link, wrapped around the title with ``\\href``.
        title: article title, already in LaTeX form.
        journal: journal name; a trailing "." is appended.
        year: publication year, written unquoted.
        key: citation key of the entry. Defaults to the literal ``key`` the
            script has always written; pass a unique value per entry to
            avoid duplicate keys in the resulting .bib file.
    """
    entry_lines = [
        f"@Article {{{key},",
        f"Author = \"{author} and others\",",
        f"Title = \"\\href{{{url}}}{{{title}}}\",",
        f"Journal = \"{journal}.\",",
        f"Year = {year},",
        "}",
        "",  # blank separator line after the closing brace
    ]
    print("\n".join(entry_lines), file=text_file)
140  
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    test()

1	#V2.0
2	#by m36-intj: makes BibTeX entries from a list of PubMed article PMIDs; uses the PMC article link if available
3	#fixed the occasionally wrong publication date: now uses Biopython instead of pymed to get the publication date
4	#same for the title: now uses Biopython for titles
5	#added Unicode-to-LaTeX translation of author names
6	#now uses special regex rules for title conversion to LaTeX format
7
8	#input list file format is
9	#%comment
10	#12345678
11	#23415251
12	#....
13
import re

from Bio import Entrez
from Bio import Medline
from pylatexenc.latexencode import unicode_to_latex
from pylatexenc.latexencode import UnicodeToLatexEncoder, UnicodeToLatexConversionRule, RULE_REGEX
from pymed import PubMed

# Shared pymed client; test() uses it to look up each PMID.
pubmed = PubMed()

# NOTE(review): this literal looks like a web-page e-mail redaction placeholder
# rather than a real address -- replace it with your own before running.
Entrez.email = "[email protected]" # Required by NCBI
23
def test():
    """Write one BibTeX entry per PMID listed in the input file.

    Reads the hard-coded ``pmids`` file line by line (lines starting with
    ``%`` are comments), looks each PMID up through pymed and Entrez, prints
    the collected fields, and writes an ``@Article`` entry to ``bibtex.txt``
    via bibtex_entry(). PMIDs that pymed cannot find are reported and skipped.
    """
    pmids_path = '/storage/emulated/0/Download/python/pmids'
    output_path = '/storage/emulated/0/Download/python/bibtex.txt'

    # The input is opened first so a missing PMID list does not truncate an
    # existing output file; ``with`` closes (and flushes) both files even if
    # a lookup raises part-way through.
    with open(pmids_path, "r", encoding="utf-8") as pmids, \
            open(output_path, "w", encoding="utf-8") as output_file:
        for line in pmids:
            pmid = line.strip()
            # Skip blank lines (they would query "[PMID]") and %-comments.
            if not pmid or pmid.startswith("%"):
                continue

            results = pubmed.query(f"{pmid}[PMID]", max_results=1)
            article = next(results, None)
            if not article:
                print(f"\nPMID {pmid} not found.")
                continue

            print(f"PMID: {pmid}")
            title = fetch_title(pmid)
            if title is None:
                # fetch_title() returns None on failure; fall back to the
                # title pymed delivered instead of crashing the conversion.
                title = getattr(article, "title", None) or ""
            title = special_convert(title)
            print(f"Title: {title}")
            raw_id = str(article.pubmed_id)
            clean_pmid = raw_id.split()[0] # Takes first entry if multiple exist

            # The author list may be empty and individual name parts may be
            # None; treat both as empty strings rather than printing "None".
            authors = article.authors or []
            first_author = authors[0] if authors else {}
            first_name = first_author.get('firstname') or ''
            last_name = first_author.get('lastname') or ''
            first_author_name = f"{first_name} {last_name}".strip()
            # translate to latex syntax
            first_author_name = unicode_to_latex(first_author_name)
            print(f"First Author: {first_author_name}")

            url = get_pmc(clean_pmid)
            print(f"URL: {url}")

            # Extract year: first whitespace-separated token of PubDate.
            handle = Entrez.esummary(db="pubmed", id=clean_pmid)
            try:
                record = Entrez.read(handle)
            finally:
                handle.close()
            year = record[0]["PubDate"].split()[0]
            print(f"Publication year: {year}")

            # get abbreviation for journal name
            abbrev = get_journal_abbreviation(clean_pmid)
            print(f"ISO Abbreviation: {abbrev}") # e.g., "Nat. Biotechnol."
            # make a bibtex entry
            bibtex_entry(output_file, first_author_name, url, title, abbrev, year)
            print("")
65
66	# Example:
def get_pmc(pmid):
    """Return the preferred article URL for *pmid*.

    Queries Entrez ELink (pubmed -> pmc). If the article itself has a PubMed
    Central record, the PMC article URL is returned; otherwise, or when the
    lookup raises, the plain PubMed URL is returned. Lookup errors are printed,
    not propagated.
    """
    try:
        handle = Entrez.elink(dbfrom="pubmed", db="pmc", id=pmid)
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()

        if record and "LinkSetDb" in record[0]:
            for link_set in record[0]["LinkSetDb"]:
                # ELink can return several link sets for this database pair
                # (e.g. PMC articles that merely cite this one). Only
                # "pubmed_pmc" identifies the article's own PMC record, so
                # blindly taking the first set could yield an unrelated URL.
                if link_set.get("LinkName") != "pubmed_pmc":
                    continue
                links = link_set.get("Link")
                if links:
                    pmcid = links[0]["Id"]
                    return f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmcid}/"
    except Exception as e:
        print(f"Error fetching PMCID for PMID {pmid}: {str(e)}")
    return f"https://pubmed.ncbi.nlm.nih.gov/{pmid}"
77
78
def get_journal_abbreviation(pmid):
    """Return a journal name for *pmid* taken from its Medline record.

    Fetches the record in Medline text format and returns the first of the
    fields ``TA``, ``JT``, ``SO`` that is present. Returns the string
    "Journal name unavailable" when none is present or when the lookup
    raises (the error is printed, not propagated).
    """
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
        try:
            record = Medline.read(handle)
        finally:
            # Close the connection even if parsing fails.
            handle.close()

        # Fields are tried in order of preference; the first one found wins.
        for field in ('TA', 'JT', 'SO'):
            if field in record:
                return record[field]

        return 'Journal name unavailable'

    except Exception as e:
        print(f"Error processing PMID {pmid}: {str(e)}")
        return "Journal name unavailable"
98
def fetch_title(pmid):
    """Fetch the article title for a given PMID.

    Returns the ``ArticleTitle`` of the first ``PubmedArticle`` in the
    Entrez XML response, or ``None`` when the response contains no article
    or the lookup raises (the error is printed, not propagated).
    """
    try:
        # Fetch XML data for the PMID
        handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
        try:
            record = Entrez.read(handle)
        finally:
            # Close the connection even if parsing fails.
            handle.close()

        # An empty article list is a plain "not found", not an error.
        if "PubmedArticle" in record and record["PubmedArticle"]:
            article = record["PubmedArticle"][0]
            return article["MedlineCitation"]["Article"]["ArticleTitle"]
    except Exception as e:
        print(f"Error fetching title for PMID {pmid}: {e}")
    return None
114
115	#this converts strings to LaTex format, there might be special rules which are needed e.g. <i>text</i> -> \textit{text}
def special_convert(string):
    """Convert *string* (e.g. an article title) to LaTeX markup.

    Inline HTML tags are translated to LaTeX commands first
    (``<i>`` -> ``\\textit{``, ``<b>`` -> ``\\textbf{``,
    ``<sub>`` -> ``\\textsubscript{``, ``<sup>`` -> ``\\textsuperscript{``,
    each closing tag -> ``}``); everything else goes through pylatexenc's
    default Unicode-to-LaTeX rules.

    Returns an empty string for ``None`` or empty input so a failed title
    lookup does not crash the conversion.
    """
    if not string:
        return ""

    # Replacement strings are written with a doubled backslash, exactly like
    # the original <i> rule, so that a single backslash is emitted.
    tag_rules = [
        (re.compile(r'<i>'), r'\\textit{'),
        (re.compile(r'<b>'), r'\\textbf{'),
        (re.compile(r'<sub>'), r'\\textsubscript{'),
        (re.compile(r'<sup>'), r'\\textsuperscript{'),
        (re.compile(r'</(?:i|b|sub|sup)>'), r'}'),
    ]
    encoder = UnicodeToLatexEncoder(
        conversion_rules=[
            # Tag rules come first so the defaults never see the angle brackets.
            UnicodeToLatexConversionRule(rule_type=RULE_REGEX, rule=tag_rules),
            'defaults',
        ]
    )
    return encoder.unicode_to_latex(string)
126
127	#@Article{hallmarks_aging,
128	# Author = "Carlos Lopez-Otin and others",
129	# Title = "\href{https://pubmed.ncbi.nlm.nih.gov/36599349/}{Hallmarks of aging: An expanding universe}",
130	# Journal = "Cell Metabolism",
131	# Year = 2023,
132	#}
def bibtex_entry(text_file, author, url, title, journal, year, key="key"):
    """Write one ``@Article`` BibTeX entry to *text_file*.

    Args:
        text_file: writable text stream that receives the entry.
        author: first author's name; " and others" is appended.
        url: article link, wrapped around the title with ``\\href``.
        title: article title, already in LaTeX form.
        journal: journal name; a trailing "." is appended.
        year: publication year, written unquoted.
        key: citation key of the entry. Defaults to the literal ``key`` the
            script has always written; pass a unique value per entry to
            avoid duplicate keys in the resulting .bib file.
    """
    entry_lines = [
        f"@Article {{{key},",
        f"Author = \"{author} and others\",",
        f"Title = \"\\href{{{url}}}{{{title}}}\",",
        f"Journal = \"{journal}.\",",
        f"Year = {year},",
        "}",
        "",  # blank separator line after the closing brace
    ]
    print("\n".join(entry_lines), file=text_file)
140
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    test()