Spaces:

mnoorchenar
/

scopus

Sleeping

App Files Files Community

scopus / Refs /overleaf.py

mnoorchenar

Update 2026-01-30 17:28:40

581d1a0 4 months ago

raw

history blame contribute delete

18.5 kB

	"""
	📚 Reference Management Pipeline (Final Updated Version)

	Steps:
	1. Parse main.tex citations
	2. Parse Refs.bib entries
	3. Merge citations with BibTeX
	4. Save into SQLite DB (table: Refs)
	5. Enrich with Crossref BibTeX
	   (then create Crossref_BibTeX_LocalKey: the Crossref entry re-keyed with the local citation key)
	6. Add Journal abbreviations (custom capitalization rules)
	7. Create Crossref_BibTeX_Abbrev (fixed keys + abbrev journals)
	8. Create Crossref_BibTeX_Protected (acronyms preserved with braces)

	Author: <you>
	"""

	import re
	import time, random
	import requests
	import sqlite3
	import pandas as pd
	from difflib import SequenceMatcher

	# Articles, conjunctions and short prepositions that stay lowercase inside a
	# journal abbreviation (the first word of a title is always capitalized).
	LOWERCASE_WORDS = {
	    "a", "an", "and", "for", "in", "of", "on", "or", "the", "to",
	}


	# ---------------------------------------------------------------------
	# 1) Parse LaTeX citations
	# ---------------------------------------------------------------------
	def parse_citations_from_tex(tex_file: str) -> pd.DataFrame:
	    r"""Collect the ``\cite{...}`` keys of a LaTeX file, grouped by section.

	    Lines whose first non-blank character is ``%`` are dropped before
	    scanning. The text is then split on ``\section{...}`` headings (an
	    optional ``*`` and a trailing ``\label{...}`` are tolerated) and every
	    ``\cite{key1, key2}`` inside a section is expanded into its keys.
	    Citations that appear before the first ``\section`` are not counted.

	    Args:
	        tex_file: Path of the UTF-8 encoded ``.tex`` file.

	    Returns:
	        DataFrame with one row per distinct key, in order of first appearance:
	        ``Reference`` (the key), ``Frequency`` (number of occurrences) and
	        ``Sections`` (comma-separated section names citing the key).
	    """
	    print("📖 Parsing citations from", tex_file)
	    with open(tex_file, "r", encoding="utf-8") as f:
	        # Each line keeps its own "\n", so join on "" to avoid doubling newlines.
	        clean_text = "".join(line for line in f if not line.strip().startswith("%"))

	    section_pattern = re.compile(r'\\section\*?\{([^}]*)\}(?:\s*\\label\{[^}]*\})?')
	    cite_pattern = re.compile(r'\\cite\{([^}]*)\}')

	    # With exactly one capturing group, split() returns
	    # [preamble, name1, text1, name2, text2, ...].
	    sections = section_pattern.split(clean_text)

	    frequency = {}     # key -> occurrences; dict order == first appearance
	    ref_sections = {}  # key -> section names, in order of first citation
	    for raw_name, section_text in zip(sections[1::2], sections[2::2]):
	        section_name = raw_name.strip()
	        for group in cite_pattern.findall(section_text):
	            for key in group.split(","):
	                ref = key.strip()
	                if not ref:
	                    # "\cite{a,}" or "\cite{}" would otherwise yield an empty key.
	                    continue
	                frequency[ref] = frequency.get(ref, 0) + 1
	                cited_in = ref_sections.setdefault(ref, [])
	                if section_name not in cited_in:
	                    cited_in.append(section_name)

	    order = list(frequency)
	    df = pd.DataFrame({
	        "Reference": order,
	        "Frequency": [frequency[c] for c in order],
	        "Sections": [", ".join(ref_sections[c]) for c in order],
	    })
	    print(f"✅ Found {len(df)} unique citations")
	    return df


	# ---------------------------------------------------------------------
	# 2) Parse BibTeX
	# ---------------------------------------------------------------------
	def parse_bibtex_to_dataframe(bib_file: str) -> pd.DataFrame:
	    """Parse a BibTeX file into one DataFrame row per entry.

	    Entries are located by an ``@type{`` header, so an ``@`` inside a field
	    value (an e-mail address, for instance) does not split an entry. Field
	    names are matched case-insensitively. Values may be brace-delimited
	    (one level of nested braces is supported), double-quoted, or a bare
	    number. Entries without a ``@type{key,`` header are skipped.

	    Args:
	        bib_file: Path of the UTF-8 encoded ``.bib`` file.

	    Returns:
	        DataFrame with columns ``Key``, ``Type``, ``Authors``, ``Title``,
	        ``Journal/Booktitle`` (``journal``, else ``booktitle``), ``Year``,
	        ``Publisher`` (``publisher``, else ``organization``) and ``BibTeX``
	        (the raw entry text). Duplicate keys keep their first occurrence.
	        The columns are present even when no entry was found.
	    """
	    print("📖 Parsing BibTeX from", bib_file)
	    with open(bib_file, "r", encoding="utf-8") as f:
	        content = f.read()

	    columns = [
	        "Key", "Type", "Authors", "Title",
	        "Journal/Booktitle", "Year", "Publisher", "BibTeX",
	    ]
	    entry_start = re.compile(r'@(?=\w+\s*\{)')
	    header_pattern = re.compile(r'@(\w+)\s*\{([^,]+),')
	    field_pattern = re.compile(
	        r'(\w+)\s*=\s*(?:'
	        r'\{((?:[^{}]|\{[^{}]*\})*)\}'  # {value}, one nesting level allowed
	        r'|"([^"]*)"'                   # "value"
	        r'|(\d+)'                       # bare number, e.g. year = 2020
	        r')'
	    )

	    starts = [m.start() for m in entry_start.finditer(content)]
	    ends = starts[1:] + [len(content)]

	    papers = []
	    for start, end in zip(starts, ends):
	        entry = content[start:end].strip()
	        header = header_pattern.match(entry)
	        if not header:
	            continue
	        entry_type = header.group(1)
	        entry_key = header.group(2).strip()

	        fields = {}
	        for m in field_pattern.finditer(entry):
	            # Exactly one of the three value groups took part in the match.
	            value = next(g for g in m.groups()[1:] if g is not None)
	            fields[m.group(1).lower()] = value

	        papers.append({
	            "Key": entry_key,
	            "Type": entry_type,
	            "Authors": fields.get("author", "").strip(),
	            "Title": fields.get("title", "").strip(),
	            "Journal/Booktitle": fields.get("journal", fields.get("booktitle", "")).strip(),
	            "Year": fields.get("year", "").strip(),
	            "Publisher": fields.get("publisher", fields.get("organization", "")).strip(),
	            "BibTeX": entry,
	        })

	    # Explicit columns keep drop_duplicates(subset="Key") valid for an empty file.
	    df = (
	        pd.DataFrame(papers, columns=columns)
	        .drop_duplicates(subset="Key", keep="first")
	        .reset_index(drop=True)
	    )
	    print(f"✅ Parsed {len(df)} BibTeX records")
	    return df


	# ---------------------------------------------------------------------
	# 3) Merge
	# ---------------------------------------------------------------------
	def merge_citations_with_bib(main_text_df: pd.DataFrame, references_bib_df: pd.DataFrame) -> pd.DataFrame:
	    """Attach BibTeX metadata to every cited key.

	    Args:
	        main_text_df: Citation table with ``Reference``, ``Frequency`` and
	            ``Sections`` columns.
	        references_bib_df: BibTeX table indexed through its ``Key`` column.

	    Returns:
	        One row per row of ``main_text_df``, in the same order, carrying the
	        three citation columns followed by ``Type``, ``Authors``, ``Title``,
	        ``Journal/Booktitle``, ``Year``, ``Publisher`` and ``BibTeX``. Keys
	        missing from the BibTeX table get empty strings for those fields.
	    """
	    print("🔗 Merging citations with BibTeX")
	    bib_columns = (
	        "Type", "Authors", "Title", "Journal/Booktitle",
	        "Year", "Publisher", "BibTeX",
	    )
	    by_key = references_bib_df.set_index("Key").to_dict(orient="index")

	    def build_record(citation):
	        # Citation columns first, then the BibTeX columns ("" when unknown).
	        cite_key = citation["Reference"]
	        meta = by_key.get(cite_key, {})
	        record = {
	            "Reference": cite_key,
	            "Frequency": citation["Frequency"],
	            "Sections": citation["Sections"],
	        }
	        record.update((column, meta.get(column, "")) for column in bib_columns)
	        return record

	    merged = pd.DataFrame([build_record(citation) for _, citation in main_text_df.iterrows()])
	    print(f"✅ Merged into {len(merged)} rows")
	    return merged


	# ---------------------------------------------------------------------
	# 4) Save to DB
	# ---------------------------------------------------------------------
	def save_to_sqlite(df: pd.DataFrame, db_path="Refs.db", table="Refs"):
	    """Write ``df`` to a SQLite table, replacing the table if it exists.

	    Args:
	        df: DataFrame to store; its index is not written.
	        db_path: Path of the SQLite database file.
	        table: Name of the destination table.
	    """
	    print(f"💾 Saving DataFrame into SQLite DB: {db_path}, table={table}")
	    conn = sqlite3.connect(db_path)
	    try:
	        df.to_sql(table, conn, if_exists="replace", index=False)
	        conn.commit()
	    finally:
	        # Release the database file even when to_sql() raises.
	        conn.close()
	    print("✅ Data saved")


	# ---------------------------------------------------------------------
	# 5) Enrich with Crossref
	# ---------------------------------------------------------------------
	def enrich_references_with_crossref(db_path="Refs.db", table="Refs", topn=3, min_similarity=95):
	    """Fill ``Crossref_BibTeX`` and ``Title_Similarity`` for every reference.

	    For each row with a title, the Crossref works API is queried with the
	    title, first author, journal, year and publisher. The candidate whose
	    title is most similar to the local title (``difflib.SequenceMatcher``
	    ratio, stored as an integer percentage) is selected. If that score
	    reaches ``min_similarity`` and the candidate has a DOI, its BibTeX is
	    downloaded from doi.org. In every other case (low score, no DOI, failed
	    request) the local BibTeX is stored instead. Rows without a title are
	    skipped. Each processed row is committed and followed by a random
	    2-5 second pause.

	    Args:
	        db_path: Path of the SQLite database file.
	        table: Table to update. The name is interpolated into the SQL text
	            as a quoted identifier, so it must come from a trusted source.
	        topn: Number of Crossref candidates requested per reference.
	        min_similarity: Minimum title similarity (0-100) required to accept
	            the Crossref BibTeX.
	    """
	    print("🌐 Enriching references with Crossref metadata")
	    conn = sqlite3.connect(db_path)
	    try:
	        cur = conn.cursor()

	        cur.execute(f'PRAGMA table_info("{table}")')
	        existing_cols = {col[1] for col in cur.fetchall()}
	        if "Crossref_BibTeX" not in existing_cols:
	            cur.execute(f'ALTER TABLE "{table}" ADD COLUMN Crossref_BibTeX TEXT;')
	        if "Title_Similarity" not in existing_cols:
	            cur.execute(f'ALTER TABLE "{table}" ADD COLUMN Title_Similarity INTEGER;')
	        conn.commit()

	        cur.execute(f'SELECT Reference, Title, Authors, "Journal/Booktitle", Year, Publisher, BibTeX FROM "{table}"')
	        rows = cur.fetchall()
	        total = len(rows)

	        for i, (ref, title, authors, journal, year, publisher, local_bib) in enumerate(rows, start=1):
	            print(f"\n[{i}/{total}] Processing Reference={ref}")

	            if not title:
	                print("⚠️ Skipping (no title)")
	                continue

	            first_author = authors.split(",")[0] if authors else ""
	            query = " ".join(
	                str(part) for part in (title, first_author, journal, year, publisher) if part
	            )

	            # best_score is always an integer percentage, even when a request fails.
	            crossref_bibtex, best_score = "", 0
	            try:
	                r = requests.get(
	                    "https://api.crossref.org/works",
	                    params={"query.bibliographic": query, "rows": topn},
	                    timeout=15,
	                )
	                r.raise_for_status()
	                items = r.json().get("message", {}).get("items", [])

	                best, best_ratio = None, 0.0
	                for item in items:
	                    # "title" can be missing or an empty list.
	                    cr_title = (item.get("title") or [""])[0]
	                    ratio = SequenceMatcher(None, title.lower(), cr_title.lower()).ratio()
	                    if ratio > best_ratio:
	                        best, best_ratio = item, ratio
	                best_score = int(round(best_ratio * 100))

	                # Only download the BibTeX when it will actually be kept.
	                if best_score >= min_similarity and best and "DOI" in best:
	                    bibtex_r = requests.get(
	                        f"https://doi.org/{best['DOI']}",
	                        headers={"Accept": "application/x-bibtex"},
	                        timeout=15,
	                    )
	                    if bibtex_r.status_code == 200:
	                        crossref_bibtex = bibtex_r.text.strip()

	            except Exception as e:
	                # Deliberate best-effort boundary: one failing lookup must not
	                # abort the remaining references.
	                print(f"⚠️ Crossref fetch failed for {ref}: {e}")

	            if best_score < min_similarity:
	                crossref_bibtex = local_bib or ""
	                print(f"ℹ️ Low similarity ({best_score}%), using local BibTeX")
	            elif not crossref_bibtex:
	                # Good match, but no DOI or the doi.org download failed.
	                crossref_bibtex = local_bib or ""
	                print("ℹ️ No Crossref BibTeX retrieved, using local BibTeX")

	            cur.execute(
	                f'UPDATE "{table}" SET Crossref_BibTeX=?, Title_Similarity=? WHERE Reference=?',
	                (crossref_bibtex, best_score, ref)
	            )
	            conn.commit()
	            print(f"✅ Updated: Similarity={best_score}%, BibTeX length={len(crossref_bibtex)}")
	            time.sleep(random.uniform(2, 5))
	    finally:
	        conn.close()
	    print("🎉 Crossref enrichment done")


	# ---------------------------------------------------------------------
	# 6) Abbreviate journals (new rules)
	# ---------------------------------------------------------------------
	def abbreviate_journal_custom(title: str) -> str:
	    """Abbreviate a journal title word by word.

	    Rules, applied to each whitespace-separated word:
	      * a word listed in ``LOWERCASE_WORDS`` is lowercased, unless it is the
	        first word of the title;
	      * an all-uppercase word of two or more characters (``IEEE``, ``ACM``)
	        is kept unchanged, so acronyms survive. This rule is skipped when
	        the whole title is uppercase, where case carries no information;
	      * any other word is cut to its first four characters, with the first
	        letter uppercased and the rest lowercased.
	    Punctuation inside a word is not removed.

	    Args:
	        title: Full journal or book title; may be empty or ``None``.

	    Returns:
	        The abbreviated title, or ``""`` for an empty title.
	    """
	    if not title:
	        return ""
	    keep_acronyms = not title.isupper()
	    abbr = []
	    for i, word in enumerate(title.split()):
	        if i != 0 and word.lower() in LOWERCASE_WORDS:
	            abbr.append(word.lower())
	        elif keep_acronyms and len(word) >= 2 and word.isupper():
	            # str.capitalize() would turn "IEEE" into "Ieee".
	            abbr.append(word)
	        else:
	            abbr.append(word[:4].capitalize())
	    return " ".join(abbr)


	def add_journal_abbreviations(db_path="Refs.db", table="Refs"):
	    """Add and fill ``Journal_Abbrev`` next to ``Journal/Booktitle``.

	    When the column is missing, the table is rebuilt so that
	    ``Journal_Abbrev`` sits directly after ``Journal/Booktitle``. SQLite
	    cannot insert a column at a given position, hence the copy into
	    ``<table>_new`` followed by a rename. Each existing column keeps its
	    declared type. The column is then recomputed for every row with
	    ``abbreviate_journal_custom``.

	    Args:
	        db_path: Path of the SQLite database file.
	        table: Table to update. The name is interpolated into the SQL text
	            as a quoted identifier, so it must come from a trusted source.
	    """
	    print("🔤 Adding journal abbreviations (custom rules)")
	    conn = sqlite3.connect(db_path)
	    try:
	        cur = conn.cursor()

	        cur.execute(f'PRAGMA table_info("{table}")')
	        columns = [(c[1], c[2]) for c in cur.fetchall()]  # (name, declared type)
	        if "Journal_Abbrev" not in {name for name, _ in columns}:
	            new_cols = []
	            for name, col_type in columns:
	                new_cols.append((name, col_type))
	                if name == "Journal/Booktitle":
	                    new_cols.append(("Journal_Abbrev", "TEXT"))

	            # Reuse each declared type: forcing TEXT would turn INTEGER
	            # columns such as Frequency into strings.
	            col_defs = ", ".join(f'"{name}" {col_type}'.rstrip() for name, col_type in new_cols)
	            cur.execute(f'CREATE TABLE "{table}_new" ({col_defs});')
	            select_expr = ", ".join(
	                "NULL" if name == "Journal_Abbrev" else f'"{name}"' for name, _ in new_cols
	            )
	            cur.execute(f'INSERT INTO "{table}_new" SELECT {select_expr} FROM "{table}";')
	            cur.execute(f'DROP TABLE "{table}";')
	            cur.execute(f'ALTER TABLE "{table}_new" RENAME TO "{table}";')
	            conn.commit()

	        cur.execute(f'SELECT Reference, "Journal/Booktitle" FROM "{table}"')
	        for ref, journal in cur.fetchall():
	            abbrev = abbreviate_journal_custom(journal)
	            cur.execute(f'UPDATE "{table}" SET Journal_Abbrev=? WHERE Reference=?', (abbrev, ref))
	        conn.commit()
	    finally:
	        conn.close()
	    print("✅ Journal abbreviations updated")


	# ---------------------------------------------------------------------
	# 7) Fix Crossref BibTeX with abbreviations
	# ---------------------------------------------------------------------
	def add_crossref_bibtex_with_abbrev(db_path="Refs.db", table="Refs"):
	    """Create ``Crossref_BibTeX_Abbrev`` from ``Crossref_BibTeX``.

	    For every row with a non-empty ``Crossref_BibTeX``, the entry key is
	    replaced by the local citation key (``Reference``). When the row has a
	    ``Journal_Abbrev``, the content of each ``journal = {...}`` field is
	    replaced by it. Rows without Crossref BibTeX are left untouched.

	    Args:
	        db_path: Path of the SQLite database file.
	        table: Table to update. The name is interpolated into the SQL text
	            as a quoted identifier, so it must come from a trusted source.
	    """
	    print("🛠️ Creating Crossref_BibTeX_Abbrev")
	    journal_field = re.compile(r'(\bjournal\s*=\s*\{)[^}]+(\})', flags=re.IGNORECASE)

	    conn = sqlite3.connect(db_path)
	    try:
	        cur = conn.cursor()

	        cur.execute(f'PRAGMA table_info("{table}")')
	        existing_cols = {col[1] for col in cur.fetchall()}
	        if "Crossref_BibTeX_Abbrev" not in existing_cols:
	            cur.execute(f'ALTER TABLE "{table}" ADD COLUMN Crossref_BibTeX_Abbrev TEXT;')
	            conn.commit()

	        cur.execute(f'SELECT Reference, Journal_Abbrev, Crossref_BibTeX FROM "{table}"')
	        for ref, journal_abbrev, crossref_bib in cur.fetchall():
	            if not crossref_bib:
	                continue

	            new_bib = crossref_bib.strip()
	            try:
	                start_brace = new_bib.index("{")
	                first_comma = new_bib.index(",", start_brace)
	            except ValueError:
	                # No "@type{key," header: keep the text as it is.
	                pass
	            else:
	                entry_type = new_bib[:start_brace]
	                new_bib = f"{entry_type}{{{ref}," + new_bib[first_comma + 1:]

	            if journal_abbrev:
	                # A function replacement inserts the abbreviation literally;
	                # in a template, "\1" + "3D ..." would read as group 13.
	                new_bib = journal_field.sub(
	                    lambda m: m.group(1) + journal_abbrev + m.group(2),
	                    new_bib,
	                )

	            cur.execute(f'UPDATE "{table}" SET Crossref_BibTeX_Abbrev=? WHERE Reference=?', (new_bib, ref))
	        conn.commit()
	    finally:
	        conn.close()
	    print("✅ Crossref_BibTeX_Abbrev created")


	# ---------------------------------------------------------------------
	# 8) Protect acronyms in fields (title, booktitle, journal)
	# ---------------------------------------------------------------------
	def protect_acronyms_in_fields(bibtex: str) -> str:
	    """Brace-protect acronyms in the title, booktitle and journal fields.

	    Each brace-delimited ``title``, ``booktitle`` or ``journal`` field
	    (any letter case, optional whitespace around ``=``, at most one level
	    of nested braces in the value) is rewritten as ``name={value}`` with a
	    lowercase field name, where:
	      * leading and trailing whitespace of the value is removed;
	      * every whitespace-separated token containing two or more uppercase
	        letters (RNN, TinyML, Grad-CAM, IEEE) is wrapped in braces, unless
	        the token already starts with ``{`` and ends with ``}``;
	      * a doubled wrapper ``{{X}}`` is reduced to ``{X}``.
	    Fields whose value nests braces more deeply are left unchanged.

	    Args:
	        bibtex: Text of one or more BibTeX entries.

	    Returns:
	        The text with the three fields rewritten.
	    """
	    # "\b" keeps "title" from matching inside "booktitle" or "subtitle".
	    field_pattern = re.compile(
	        r'\b(title|booktitle|journal)\s*=\s*\{((?:[^{}]|\{[^{}]*\})*)\}',
	        flags=re.IGNORECASE,
	    )

	    def wrap_token(token: str) -> str:
	        # Already wrapped: leave it alone.
	        if token.startswith("{") and token.endswith("}"):
	            return token
	        # Two or more uppercase letters mark an acronym-like token.
	        if sum(1 for c in token if c.isupper()) >= 2:
	            return "{" + token + "}"
	        return token

	    def clean_field(match) -> str:
	        field_name = match.group(1).lower()
	        text = match.group(2).strip()

	        # The capturing split keeps the whitespace runs so spacing is preserved.
	        tokens = re.split(r'(\s+)', text)
	        fixed = "".join(wrap_token(tok) if tok.strip() else tok for tok in tokens)

	        # Collapse an accidental double wrapper {{X}} into {X}. No blanket
	        # "}} " replacement: it would unbalance nested groups like {pre-{CNN}}.
	        fixed = re.sub(r'\{\{([^{}]+)\}\}', r'{\1}', fixed)

	        return f"{field_name}={{{fixed.strip()}}}"

	    return field_pattern.sub(clean_field, bibtex)

	def add_crossref_bibtex_with_protected_titles(db_path="Refs.db", table="Refs"):
	    """Create ``Crossref_BibTeX_Protected`` from ``Crossref_BibTeX_Abbrev``.

	    The column is added when missing. Every row with a non-empty
	    ``Crossref_BibTeX_Abbrev`` gets that text passed through
	    ``protect_acronyms_in_fields`` and stored in the new column; other rows
	    are left untouched.

	    Args:
	        db_path: Path of the SQLite database file.
	        table: Table to update. The name is interpolated into the SQL text
	            as a quoted identifier, so it must come from a trusted source.
	    """
	    print("🛡️ Creating Crossref_BibTeX_Protected with acronym-safe fields")
	    conn = sqlite3.connect(db_path)
	    try:
	        cur = conn.cursor()

	        cur.execute(f'PRAGMA table_info("{table}")')
	        existing_cols = {col[1] for col in cur.fetchall()}
	        if "Crossref_BibTeX_Protected" not in existing_cols:
	            cur.execute(f'ALTER TABLE "{table}" ADD COLUMN Crossref_BibTeX_Protected TEXT;')
	            conn.commit()

	        cur.execute(f'SELECT Reference, Crossref_BibTeX_Abbrev FROM "{table}"')
	        for ref, bib in cur.fetchall():
	            if not bib:
	                continue
	            protected_bib = protect_acronyms_in_fields(bib)
	            cur.execute(
	                f'UPDATE "{table}" SET Crossref_BibTeX_Protected=? WHERE Reference=?',
	                (protected_bib, ref)
	            )
	            print(f"✅ Updated {ref}")

	        conn.commit()
	    finally:
	        # Release the database file even when a statement fails.
	        conn.close()
	    print("✅ Crossref_BibTeX_Protected created (title, booktitle, journal cleaned)")


	# ---------------------------------------------------------------------
	# Create Crossref_BibTeX_LocalKey right after Crossref_BibTeX
	# ---------------------------------------------------------------------
	def add_crossref_bibtex_with_local_keys(db_path="Refs.db", table="Refs"):
	    """Create ``Crossref_BibTeX_LocalKey`` right after ``Crossref_BibTeX``.

	    When the column is missing, the table is rebuilt so that the new column
	    directly follows ``Crossref_BibTeX`` (or comes last if that column does
	    not exist). SQLite cannot insert a column at a given position, hence
	    the copy into ``<table>_new`` followed by a rename. Each existing
	    column keeps its declared type. The column is then recomputed for every
	    row with a non-empty ``Crossref_BibTeX``: the same entry with its key
	    replaced by the local citation key (``Reference``). This also happens
	    when the column already existed, so a re-run never leaves stale values.

	    Args:
	        db_path: Path of the SQLite database file.
	        table: Table to update. The name is interpolated into the SQL text
	            as a quoted identifier, so it must come from a trusted source.
	    """
	    print("🔑 Creating Crossref_BibTeX_LocalKey (after Crossref_BibTeX)")
	    conn = sqlite3.connect(db_path)
	    try:
	        cur = conn.cursor()

	        cur.execute(f'PRAGMA table_info("{table}")')
	        columns = [(c[1], c[2]) for c in cur.fetchall()]  # (name, declared type)

	        if any(name == "Crossref_BibTeX_LocalKey" for name, _ in columns):
	            print("ℹ️ Column already exists, skipping recreation")
	        else:
	            # New column order: LocalKey directly after Crossref_BibTeX.
	            new_cols = []
	            for name, col_type in columns:
	                new_cols.append((name, col_type))
	                if name == "Crossref_BibTeX":
	                    new_cols.append(("Crossref_BibTeX_LocalKey", "TEXT"))
	            if len(new_cols) == len(columns):
	                # Anchor column missing: still create the column, at the end.
	                new_cols.append(("Crossref_BibTeX_LocalKey", "TEXT"))

	            # Reuse each declared type: forcing TEXT would turn INTEGER
	            # columns such as Title_Similarity into strings.
	            col_defs = ", ".join(f'"{name}" {col_type}'.rstrip() for name, col_type in new_cols)
	            cur.execute(f'CREATE TABLE "{table}_new" ({col_defs});')

	            # Copy the old rows; the new column starts out NULL.
	            select_expr = ", ".join(
	                "NULL" if name == "Crossref_BibTeX_LocalKey" else f'"{name}"'
	                for name, _ in new_cols
	            )
	            cur.execute(f'INSERT INTO "{table}_new" SELECT {select_expr} FROM "{table}";')

	            cur.execute(f'DROP TABLE "{table}";')
	            cur.execute(f'ALTER TABLE "{table}_new" RENAME TO "{table}";')
	            conn.commit()

	        cur.execute(f'SELECT Reference, Crossref_BibTeX FROM "{table}"')
	        for ref, bib in cur.fetchall():
	            if not bib:
	                continue

	            new_bib = bib.strip()
	            try:
	                start_brace = new_bib.index("{")
	                first_comma = new_bib.index(",", start_brace)
	            except ValueError:
	                # No "@type{key," header: store the text unchanged.
	                pass
	            else:
	                entry_type = new_bib[:start_brace]
	                new_bib = f"{entry_type}{{{ref}," + new_bib[first_comma + 1:]

	            cur.execute(
	                f'UPDATE "{table}" SET Crossref_BibTeX_LocalKey=? WHERE Reference=?',
	                (new_bib, ref)
	            )
	            print(f"✅ Updated {ref} with local key")

	        conn.commit()
	    finally:
	        conn.close()
	    print("✅ Crossref_BibTeX_LocalKey created right after Crossref_BibTeX")


	# ---------------------------------------------------------------------
	# 9) Main entry point
	# ---------------------------------------------------------------------
	def main():
	    """Interactive entry point of the pipeline.

	    Requires ``Refs.bib`` in the working directory. Answering ``1`` at the
	    prompt stores the parsed BibTeX entries, plus an empty ``Used`` column,
	    in table ``Refs`` of ``Refs.db`` and stops. Any other answer runs the
	    full pipeline, which also requires ``main.tex``: parse citations, merge
	    with the BibTeX entries, save, enrich from Crossref, then derive the
	    LocalKey, Journal_Abbrev, Abbrev and Protected columns.
	    """
	    import os

	    bib_path, tex_path = "Refs.bib", "main.tex"
	    db_path, table = "Refs.db", "Refs"

	    if not os.path.exists(bib_path):
	        print("❌ Exiting. Refs.bib is required.")
	        return

	    mode = input("⚡ Run mode: [1] Only parse Refs.bib, [2] Full pipeline with main.tex ? (1/2): ").strip()
	    refs_bib = parse_bibtex_to_dataframe(bib_path)

	    if mode == "1":
	        # Blank "Used" column; a newly assigned column is already the last one.
	        refs_bib["Used"] = None
	        save_to_sqlite(refs_bib, db_path, table)
	        print("✅ Saved references from Refs.bib only (with 'Used' column)")
	        return

	    if not os.path.exists(tex_path):
	        print("❌ Exiting. main.tex is required for full pipeline.")
	        return

	    merged_df = merge_citations_with_bib(parse_citations_from_tex(tex_path), refs_bib)
	    merged_df.insert(0, "Index", range(1, len(merged_df) + 1))
	    save_to_sqlite(merged_df, db_path, table)

	    # Order matters: each step reads columns written by the previous ones.
	    pipeline = (
	        enrich_references_with_crossref,
	        add_crossref_bibtex_with_local_keys,
	        add_journal_abbreviations,
	        add_crossref_bibtex_with_abbrev,
	        add_crossref_bibtex_with_protected_titles,
	    )
	    for step in pipeline:
	        step(db_path, table)

	    print("🎉 All steps completed successfully")


	if __name__ == "__main__":
	    main()