| """ | |
| Flask Application for Reference Management Pipeline | |
| Complete implementation matching overleaf.py functionality | |
| Includes LaTeX citation parsing, BibTeX processing, and full pipeline | |
| FEATURES: | |
| - Fixed abbreviations to include periods (ISO 4 standard): "Energy" β "Ener." | |
| - LaTeX manuscript analysis for citation frequency and section tracking | |
| - Filter references by section | |
| - Clear entire database | |
| """ | |
from flask import Flask, render_template, request, jsonify, send_file
import sqlite3
import pandas as pd
import os
import json
from datetime import datetime
import re
from difflib import SequenceMatcher
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import io
import time
import random
import hashlib
from werkzeug.utils import secure_filename

app = Flask(__name__)
app.config['DATABASE'] = 'refs_management.db'
app.config['API_KEY'] = os.environ.get('API_KEY', 'your-secret-key-here')
app.config['ENVIRONMENT'] = os.environ.get('ENVIRONMENT', 'development')
app.config['UPLOAD_FOLDER'] = '/tmp/uploads'
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max

# Create upload folder
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

# Prepositions to keep lowercase in abbreviations
LOWERCASE_WORDS = {"and", "or", "in", "on", "of", "for", "to", "the", "a", "an", "with", "at", "by", "from"}

# =====================================================================
# HTTP CLIENT WITH RETRIES
# =====================================================================

def make_http_session():
    """Create HTTP session with retries and proper headers"""
    s = requests.Session()
    retry = Retry(
        total=5,
        backoff_factor=0.8,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"]
    )
    s.mount("https://", HTTPAdapter(max_retries=retry))
    s.headers.update({
        "User-Agent": "RefsManagement/1.0 (mailto:contact@example.com)",
        "Accept": "application/json",
    })
    return s

HTTP = make_http_session()
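# The shared session above retries GETs up to 5 times on 429/5xx responses with
# exponential backoff (0.8 s base), so transient Crossref/doi.org hiccups are
# absorbed transparently by every HTTP.get() call in this module.
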
# =====================================================================
# AUTHENTICATION
# =====================================================================

def check_api_key():
    """Check API key for protected routes"""
    if app.config['ENVIRONMENT'] == 'development':
        return True
    api_key = request.headers.get('X-API-Key')
    return api_key == app.config['API_KEY']

# =====================================================================
# LATEX CITATION PARSING (from overleaf.py)
# =====================================================================

def parse_citations_from_tex(tex_content: str) -> pd.DataFrame:
    """Parse citations from LaTeX content with section tracking"""
    print("Parsing citations from LaTeX")
    lines = tex_content.split('\n')
    clean_text = "\n".join(line for line in lines if not line.strip().startswith("%"))
    section_pattern = re.compile(r'\\section\{([^}]*)\}(?:\\label\{[^}]*\})?')
    cite_pattern = re.compile(r'\\cite\{([^}]*)\}')
    sections = section_pattern.split(clean_text)
    citations, ref_sections = [], {}
    # section_pattern has one capture group, so split() yields
    # [preamble, name1, body1, name2, body2, ...]; walk the name/body pairs.
    for i in range(1, len(sections), 2):
        section_name = sections[i].strip()
        section_text = sections[i+1] if i+1 < len(sections) else ""
        matches = cite_pattern.findall(section_text)
        for match in matches:
            for key in match.split(","):
                ref = key.strip()
                citations.append(ref)
                if ref not in ref_sections:
                    ref_sections[ref] = []
                if section_name not in ref_sections[ref]:
                    ref_sections[ref].append(section_name)
    freq, order = {}, []
    for c in citations:
        if c not in freq:
            order.append(c)
        freq[c] = freq.get(c, 0) + 1
    df = pd.DataFrame({
        "Reference": order,
        "Frequency": [freq[c] for c in order],
        "Sections": [", ".join(ref_sections[c]) for c in order]
    })
    print(f"Found {len(df)} unique citations")
    return df
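# Sketch of the expected shape, for illustration:
#   parse_citations_from_tex(r"\section{Intro} \cite{smith2020,lee2021} \cite{smith2020}")
# yields a DataFrame like:
#   Reference   Frequency  Sections
#   smith2020   2          Intro
#   lee2021     1          Intro
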
def merge_citations_with_bib(citations_df: pd.DataFrame, bib_df: pd.DataFrame) -> pd.DataFrame:
    """Merge citations with BibTeX entries"""
    print("Merging citations with BibTeX")
    bib_lookup = bib_df.set_index("Key").to_dict(orient="index")
    merged_records = []
    for _, row in citations_df.iterrows():
        key = row["Reference"]
        bib_info = bib_lookup.get(key, {})
        merged_records.append({
            "Reference": key,
            "Frequency": row["Frequency"],
            "Sections": row["Sections"],
            "Type": bib_info.get("Type", ""),
            "Authors": bib_info.get("Authors", ""),
            "Title": bib_info.get("Title", ""),
            "Journal/Booktitle": bib_info.get("Journal/Booktitle", ""),
            "Year": bib_info.get("Year", ""),
            "Publisher": bib_info.get("Publisher", ""),
            "Volume": bib_info.get("Volume", ""),
            "Pages": bib_info.get("Pages", ""),
            "DOI": bib_info.get("DOI", ""),
            "BibTeX": bib_info.get("BibTeX", "")
        })
    df = pd.DataFrame(merged_records)
    print(f"Merged into {len(df)} rows")
    return df

# =====================================================================
# UTILITY FUNCTIONS
# =====================================================================

def abbreviate_journal_custom(title: str) -> str:
    """
    Custom abbreviation with periods (ISO 4 style, first-four-letters heuristic).
    Examples (actual output of this function):
        "Energy" → "Ener."
        "Applied Energy" → "Appl. Ener."
        "Journal of Energy" → "Jour. of Ener."
        "IEEE Transactions on Neural Networks" → "IEEE Tran. on Neur. Netw."
    Note: a true ISO 4 lookup would give e.g. "Trans." for "Transactions".
    """
    if not title:
        return ""
    words = title.split()
    abbr = []
    for i, word in enumerate(words):
        # Keep prepositions and conjunctions in lowercase (except at start)
        if word.lower() in LOWERCASE_WORDS and i != 0:
            abbr.append(word.lower())
        # Keep acronyms as-is (words with 2+ uppercase letters)
        elif sum(1 for c in word if c.isupper()) >= 2:
            abbr.append(word)
        # Abbreviate long words (>4 letters): first 4 letters plus a period
        elif len(word) > 4:
            abbr.append(word[:4].capitalize() + ".")
        # Short words (≤4 letters): capitalize and add a period
        else:
            abbr.append(word.capitalize() + ".")
    return " ".join(abbr)

def get_db_connection():
    """Get database connection"""
    conn = sqlite3.connect(app.config['DATABASE'])
    conn.row_factory = sqlite3.Row
    return conn

def init_db():
    """Initialize database with proper schema"""
    conn = get_db_connection()
    cur = conn.cursor()
    # Create table with all columns in proper order
    cur.execute("""
        CREATE TABLE IF NOT EXISTS bibliography (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            index_num INTEGER,
            session_id TEXT,
            reference TEXT,
            frequency INTEGER,
            sections TEXT,
            key TEXT UNIQUE,
            doi TEXT,
            type TEXT,
            authors TEXT,
            title TEXT,
            journal_booktitle TEXT,
            year TEXT,
            year_int INTEGER,
            publisher TEXT,
            volume TEXT,
            pages TEXT,
            bibtex TEXT,
            crossref_bibtex TEXT,
            crossref_bibtex_localkey TEXT,
            title_similarity INTEGER,
            journal_abbreviation TEXT,
            crossref_bibtex_abbrev TEXT,
            crossref_bibtex_protected TEXT,
            used TEXT,
            imported_date TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    # Create partial index on non-empty DOIs
    cur.execute("""
        CREATE INDEX IF NOT EXISTS idx_bib_doi
        ON bibliography(doi)
        WHERE doi IS NOT NULL AND doi != ''
    """)
    conn.commit()
    conn.close()

def extract_year_int(year_str):
    """Extract integer year from year string"""
    if not year_str:
        return None
    match = re.search(r'\d{4}', str(year_str))
    return int(match.group()) if match else None
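# Examples: extract_year_int("2021") -> 2021; extract_year_int("ca. 2019 (rev.)") -> 2019;
# extract_year_int("") and extract_year_int("n.d.") -> None.
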
def scan_brace_balanced_value(text, start_pos):
    """Scan for a brace-balanced field value starting at start_pos"""
    if start_pos >= len(text):
        return "", start_pos
    if text[start_pos] == '{':
        # Brace-delimited value: walk until the braces balance again
        depth = 1
        pos = start_pos + 1
        while pos < len(text) and depth > 0:
            if text[pos] == '{':
                depth += 1
            elif text[pos] == '}':
                depth -= 1
            pos += 1
        return text[start_pos+1:pos-1], pos
    elif text[start_pos] == '"':
        # Quote-delimited value: stop at the next unescaped quote
        pos = start_pos + 1
        while pos < len(text):
            if text[pos] == '"' and text[pos-1] != '\\':
                return text[start_pos+1:pos], pos + 1
            pos += 1
        return text[start_pos+1:], len(text)
    else:
        # Bare value: read until a comma or closing brace
        pos = start_pos
        while pos < len(text) and text[pos] not in ',}':
            pos += 1
        return text[start_pos:pos].strip(), pos
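# Example: scan_brace_balanced_value("{Deep {RL} survey}, year", 0)
# returns ("Deep {RL} survey", 18) - the brace-balanced body plus the index
# just past the closing brace. Quoted ("...") and bare values are handled too.
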
def parse_bibtex_entry(entry_text):
    """Parse single BibTeX entry with proper brace balancing"""
    match = re.match(r'@(\w+)\s*\{([^,]+),', entry_text)
    if not match:
        return None
    entry_type, entry_key = match.groups()
    fields = {}
    fields_start = entry_text.find(entry_key) + len(entry_key) + 1
    fields_text = entry_text[fields_start:]
    pos = 0
    while pos < len(fields_text):
        # Skip whitespace and separators between fields
        while pos < len(fields_text) and fields_text[pos] in ' \t\n\r,':
            pos += 1
        if pos >= len(fields_text) or fields_text[pos] == '}':
            break
        field_match = re.match(r'(\w+)\s*=\s*', fields_text[pos:])
        if not field_match:
            break
        field_name = field_match.group(1).lower()
        pos += field_match.end()
        value, new_pos = scan_brace_balanced_value(fields_text, pos)
        fields[field_name] = value.strip()
        pos = new_pos
    return {
        'type': entry_type,
        'key': entry_key.strip(),
        'fields': fields
    }
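# Example: parse_bibtex_entry('@article{smith2020, title={An Example}, year={2020}}')
# -> {'type': 'article', 'key': 'smith2020',
#     'fields': {'title': 'An Example', 'year': '2020'}}
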
def parse_bibtex_input(bibtex_content):
    """Parse BibTeX content from user input"""
    entries = ["@" + e for e in bibtex_content.split("@") if e.strip()]
    papers = []
    for entry in entries:
        parsed = parse_bibtex_entry(entry)
        if not parsed:
            continue
        fields = parsed['fields']
        papers.append({
            "Key": parsed['key'],
            "Type": parsed['type'],
            "Authors": fields.get("author", "").strip(),
            "Title": fields.get("title", "").strip(),
            "Journal/Booktitle": fields.get("journal", fields.get("booktitle", "")).strip(),
            "Year": fields.get("year", "").strip(),
            "Publisher": fields.get("publisher", fields.get("organization", "")).strip(),
            "Volume": fields.get("volume", "").strip(),
            "Pages": fields.get("pages", "").strip(),
            "DOI": fields.get("doi", "").strip(),
            "BibTeX": entry.strip(),
            "Imported_Date": datetime.now().isoformat()
        })
    return pd.DataFrame(papers).drop_duplicates(subset="Key", keep="first").reset_index(drop=True)
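# Note: the split on "@" above is naive - a literal "@" inside a field value
# (e.g. an email address in an author or note field) would break that entry
# apart. Acceptable for typical exported BibTeX, which rarely contains one.
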
def clean_bibtex_fields(bibtex):
    """Remove unwanted fields from BibTeX entries"""
    if not bibtex:
        return bibtex
    fields_to_remove = ['url', 'source', 'publication_stage', 'note', 'abstract']
    for field in fields_to_remove:
        # \b keeps e.g. a "biburl" field from matching the "url" pattern
        pattern = rf'\s*\b{field}\s*=\s*'
        pos = 0
        result = []
        while pos < len(bibtex):
            match = re.search(pattern, bibtex[pos:], re.IGNORECASE)
            if not match:
                result.append(bibtex[pos:])
                break
            result.append(bibtex[pos:pos + match.start()])
            value_start = pos + match.end()
            _, value_end = scan_brace_balanced_value(bibtex, value_start)
            while value_end < len(bibtex) and bibtex[value_end] in ' \t\n\r,':
                value_end += 1
            pos = value_end
        bibtex = ''.join(result)
    bibtex = re.sub(r'\n\s*\n\s*\n', '\n\n', bibtex)
    bibtex = re.sub(r',\s*,', ',', bibtex)
    bibtex = re.sub(r',(\s*)\}', r'\1}', bibtex)
    lines = [line for line in bibtex.split('\n') if line.strip()]
    return '\n'.join(lines)
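# Example: given '@article{k,\n  url = {https://x.org},\n  title = {T}\n}',
# the url field is removed in place (likewise any source/publication_stage/
# note/abstract field) and stray commas and blank lines are tidied up.
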
def protect_acronyms_in_fields(bibtex):
    """Protect acronyms with braces"""
    if not bibtex:
        return bibtex

    def wrap_token(token):
        if token.startswith("{") and token.endswith("}"):
            return token
        if sum(1 for c in token if c.isupper()) >= 2:
            return "{" + token + "}"
        return token

    def process_field_value(value):
        if value.startswith("{") and value.endswith("}"):
            inner = value[1:-1]
            if not ('{' in inner and '}' in inner):
                return value
        tokens = re.split(r'(\s+)', value)
        fixed = "".join(wrap_token(tok) if tok.strip() else tok for tok in tokens)
        fixed = re.sub(r'\{\{([^{}]+)\}\}', r'{\1}', fixed)
        return fixed

    for field in ["title", "booktitle", "journal"]:
        pattern = rf'({field}\s*=\s*)'
        matches = list(re.finditer(pattern, bibtex, re.IGNORECASE))
        # Process matches right-to-left so earlier offsets stay valid
        for match in reversed(matches):
            field_start = match.end()
            value, value_end = scan_brace_balanced_value(bibtex, field_start)
            if value:
                processed = process_field_value(value)
                new_field = f"{match.group(1)}{{{processed}}}"
                bibtex = bibtex[:match.start()] + new_field + bibtex[value_end:]
    return bibtex
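# Example: a field 'title = {CNN models for IoT}' becomes
# 'title = {{CNN} models for {IoT}}', so BibTeX styles preserve the casing
# of any token with two or more uppercase letters.
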
def replace_bibtex_key(bibtex, new_key):
    """Replace the citation key in a BibTeX entry"""
    if not bibtex:
        return bibtex
    try:
        start_brace = bibtex.index("{")
        first_comma = bibtex.index(",", start_brace)
        entry_type = bibtex[:start_brace]
        new_start = f"{entry_type}{{{new_key},"
        return new_start + bibtex[first_comma+1:]
    except ValueError:
        return bibtex
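# Example: replace_bibtex_key('@article{cr_key_2020, title={X}}', 'smith2020')
# -> '@article{smith2020, title={X}}'
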
def enrich_with_crossref(df):
    """Enrich references with Crossref data"""
    enriched_rows = []
    for idx, row in df.iterrows():
        enriched_data = dict(row)
        if not row.get('Title'):
            enriched_data['Crossref_BibTeX'] = row.get('BibTeX', '')
            enriched_data['Title_Similarity'] = 0
            enriched_rows.append(enriched_data)
            continue
        query_parts = [row['Title']]
        if row.get('Authors'):
            query_parts.append(row['Authors'].split(',')[0])
        if row.get('Journal/Booktitle'):
            query_parts.append(row['Journal/Booktitle'])
        if row.get('Year'):
            query_parts.append(row['Year'])
        query = " ".join(query_parts)
        try:
            url = f"https://api.crossref.org/works?query.bibliographic={requests.utils.quote(query)}&rows=3"
            response = HTTP.get(url, timeout=15)
            response.raise_for_status()
            items = response.json().get("message", {}).get("items", [])
            best_score = 0
            crossref_bibtex = row.get('BibTeX', '')
            best_doi = row.get('DOI', '')
            for item in items:
                cr_title = item.get("title", [""])[0]
                score = SequenceMatcher(None, row['Title'].lower(), cr_title.lower()).ratio()
                if row.get('Year') and 'published-print' in item:
                    cr_year = str(item['published-print'].get('date-parts', [['']])[0][0])
                    if row['Year'].strip() == cr_year:
                        score = min(1.0, score + 0.1)
                if score > best_score:
                    best_score = score
                    best_doi = item.get('DOI', best_doi)
            if best_doi:
                try:
                    bibtex_response = HTTP.get(
                        f"https://doi.org/{best_doi}",
                        headers={"Accept": "application/x-bibtex"},
                        timeout=15
                    )
                    if bibtex_response.status_code == 200:
                        crossref_bibtex = bibtex_response.text.strip()
                except Exception as e:
                    print(f"WARNING: BibTeX fetch failed for DOI {best_doi}: {e}")
            enriched_data['Crossref_BibTeX'] = crossref_bibtex if best_score >= 0.85 else row.get('BibTeX', '')
            enriched_data['Title_Similarity'] = int(round(best_score * 100))
            if best_doi:
                enriched_data['DOI'] = best_doi
        except Exception as e:
            print(f"WARNING: Crossref enrichment failed: {e}")
            enriched_data['Crossref_BibTeX'] = row.get('BibTeX', '')
            enriched_data['Title_Similarity'] = 0
        time.sleep(0.15 + random.uniform(0, 0.25))
        enriched_rows.append(enriched_data)
    return pd.DataFrame(enriched_rows)
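# Crossref matching notes: candidates are scored by title similarity
# (SequenceMatcher ratio, +0.1 bonus for a matching print year, capped at 1.0);
# the fetched BibTeX is adopted only when the best score is >= 0.85, otherwise
# the original entry is kept. The short randomized sleep keeps request rates
# polite toward api.crossref.org.
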
def add_journal_abbreviations(df):
    """Add journal abbreviations and create all BibTeX versions"""
    abbreviated_rows = []
    for idx, row in df.iterrows():
        journal = row.get('Journal/Booktitle', '')
        journal_abbrev = abbreviate_journal_custom(journal)
        row_data = dict(row)
        row_data['Journal_Abbreviation'] = journal_abbrev
        # Create LocalKey version
        key_to_use = row_data.get('Key') or row_data.get('Reference') or f"ref_{idx}"
        if row_data.get('Crossref_BibTeX'):
            row_data['Crossref_BibTeX_LocalKey'] = replace_bibtex_key(
                row_data['Crossref_BibTeX'],
                key_to_use
            )
        else:
            row_data['Crossref_BibTeX_LocalKey'] = row_data.get('BibTeX', '')
        # Create abbreviated version
        if journal_abbrev and row_data.get('Crossref_BibTeX_LocalKey'):
            new_bib = row_data['Crossref_BibTeX_LocalKey'].strip()
            new_bib = re.sub(
                r'(journal\s*=\s*\{)[^}]+(\})',
                rf'\1{journal_abbrev}\2',
                new_bib,
                flags=re.IGNORECASE
            )
            row_data['Crossref_BibTeX_Abbrev'] = new_bib
        else:
            row_data['Crossref_BibTeX_Abbrev'] = row_data.get('Crossref_BibTeX_LocalKey', row_data.get('BibTeX', ''))
        # Create protected version
        row_data['Crossref_BibTeX_Protected'] = protect_acronyms_in_fields(
            row_data.get('Crossref_BibTeX_Abbrev', row_data.get('BibTeX', ''))
        )
        # Clean all versions
        for bib_col in ['BibTeX', 'Crossref_BibTeX', 'Crossref_BibTeX_LocalKey',
                        'Crossref_BibTeX_Abbrev', 'Crossref_BibTeX_Protected']:
            if row_data.get(bib_col):
                row_data[bib_col] = clean_bibtex_fields(row_data[bib_col])
        abbreviated_rows.append(row_data)
    return pd.DataFrame(abbreviated_rows)
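# Pipeline of BibTeX variants produced per row:
#   BibTeX                    - as supplied by the user
#   Crossref_BibTeX           - Crossref's record for the matched DOI (if enriched)
#   Crossref_BibTeX_LocalKey  - same, re-keyed to the local citation key
#   Crossref_BibTeX_Abbrev    - journal name swapped for its abbreviation
#   Crossref_BibTeX_Protected - acronyms brace-protected; export_bibtex serves this
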
# =====================================================================
# ROUTES
# =====================================================================
# NOTE: the @app.route decorators were missing from this listing; the paths
# and methods below are editorial assumptions, restored so the routes register.

@app.route('/')
def index():
    """Main page"""
    return render_template('index.html')

@app.route('/process', methods=['POST'])  # path assumed, see NOTE above
def process_bibtex():
    """Process BibTeX content with optional LaTeX analysis"""
    if not check_api_key():
        return jsonify({'error': 'Unauthorized'}), 401
    try:
        # Handle both JSON and FormData
        if request.is_json:
            data = request.get_json()
            bibtex_content = data.get('bibtex_content') or data.get('bibtex', '')
            input_mode = data.get('input_mode', 'bibtex')
            enrich = data.get('enrich', False)
            abbreviate = data.get('abbreviate', False)
            protect = data.get('protect', False)
            save_to_db = data.get('save_to_db', False)
            latex_file = None
        else:
            # FormData from file upload
            bibtex_content = request.form.get('bibtex_content', '')
            input_mode = request.form.get('input_mode', 'bibtex')
            enrich = request.form.get('enrich', 'false').lower() == 'true'
            abbreviate = request.form.get('abbreviate', 'false').lower() == 'true'
            protect = request.form.get('protect', 'false').lower() == 'true'
            save_to_db = request.form.get('save_to_db', 'false').lower() == 'true'
            latex_file = request.files.get('latex_file')
        print(f"Received: {len(bibtex_content)} chars, mode={input_mode}")
        # TITLE MODE: Search Crossref for each title
        if input_mode == 'title':
            titles = [line.strip() for line in bibtex_content.split('\n') if line.strip()]
            print(f"Title mode: searching for {len(titles)} titles")
            if not titles:
                return jsonify({'error': 'No titles provided'}), 400
            # Convert titles to BibTeX entries via Crossref
            bibtex_entries = []
            for i, title in enumerate(titles, 1):
                print(f"  [{i}/{len(titles)}] Searching: {title[:50]}...")
                try:
                    url = f"https://api.crossref.org/works?query.bibliographic={requests.utils.quote(title)}&rows=1"
                    response = HTTP.get(url, timeout=15)
                    items = response.json().get("message", {}).get("items", [])
                    if items and items[0].get('DOI'):
                        doi = items[0]['DOI']
                        bibtex_r = HTTP.get(
                            f"https://doi.org/{doi}",
                            headers={"Accept": "application/x-bibtex"},
                            timeout=15
                        )
                        if bibtex_r.status_code == 200:
                            bibtex_entries.append(bibtex_r.text.strip())
                            print(f"    Found via DOI: {doi}")
                    time.sleep(random.uniform(1, 2))
                except Exception as e:
                    print(f"    WARNING: Failed: {e}")
            bibtex_content = '\n\n'.join(bibtex_entries)
            print(f"Retrieved {len(bibtex_entries)} BibTeX entries from titles")
            if not bibtex_content:
                return jsonify({'error': 'No BibTeX entries found for the provided titles'}), 400
        if not bibtex_content.strip():
            return jsonify({'error': 'No BibTeX content provided'}), 400
        # Parse BibTeX
        df = parse_bibtex_input(bibtex_content)
        if df.empty:
            return jsonify({'error': 'No valid BibTeX entries found'}), 400
        # Parse LaTeX file if provided
        latex_analyzed = False
        citations_found = 0
        if latex_file:
            print("LaTeX file provided, analyzing...")
            try:
                latex_content = latex_file.read().decode('utf-8')
                citations_df = parse_citations_from_tex(latex_content)
                citations_found = len(citations_df)
                # Merge LaTeX citation data with BibTeX data
                df = merge_citations_with_bib(citations_df, df)
                df.insert(0, "Index", range(1, len(df) + 1))
                latex_analyzed = True
                print(f"LaTeX analyzed: {citations_found} citations found")
            except Exception as e:
                print(f"WARNING: LaTeX analysis failed: {e}")
        # Enrich if requested
        if enrich:
            df = enrich_with_crossref(df)
        else:
            df['Crossref_BibTeX'] = df['BibTeX']
            df['Title_Similarity'] = 0
        # Always create all versions
        df = add_journal_abbreviations(df)
        # Save to database if requested
        db_id = None
        if save_to_db:
            conn = get_db_connection()
            cursor = conn.cursor()
            session_id = datetime.now().isoformat()
            for _, row in df.iterrows():
                doi = row.get('DOI', '').strip()
                year_int = extract_year_int(row.get('Year', ''))
                key_val = row.get('Reference') or row.get('Key', f"ref_{row.name}")
                try:
                    cursor.execute('''
                        INSERT OR REPLACE INTO bibliography
                        (index_num, session_id, reference, frequency, sections, key, doi, type,
                         authors, title, journal_booktitle, year, year_int, publisher, volume, pages,
                         bibtex, crossref_bibtex, crossref_bibtex_localkey, title_similarity,
                         journal_abbreviation, crossref_bibtex_abbrev, crossref_bibtex_protected,
                         imported_date)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ''', (
                        row.get('Index'), session_id, row.get('Reference', key_val),
                        row.get('Frequency', 0), row.get('Sections', ''),
                        key_val, doi, row.get('Type', ''), row.get('Authors', ''),
                        row.get('Title', ''), row.get('Journal/Booktitle', ''),
                        row.get('Year', ''), year_int, row.get('Publisher', ''),
                        row.get('Volume', ''), row.get('Pages', ''),
                        row.get('BibTeX', ''), row.get('Crossref_BibTeX', ''),
                        row.get('Crossref_BibTeX_LocalKey', ''),
                        row.get('Title_Similarity', 0), row.get('Journal_Abbreviation', ''),
                        row.get('Crossref_BibTeX_Abbrev', ''),
                        row.get('Crossref_BibTeX_Protected', ''),
                        datetime.now().isoformat()
                    ))
                except sqlite3.IntegrityError as e:
                    print(f"WARNING: DB insert failed for {key_val}: {e}")
            conn.commit()
            db_id = session_id
            conn.close()
        # Determine final BibTeX column
        if protect:
            final_bibtex_col = 'Crossref_BibTeX_Protected'
        elif abbreviate:
            final_bibtex_col = 'Crossref_BibTeX_Abbrev'
        else:
            final_bibtex_col = 'Crossref_BibTeX_LocalKey'
        # Prepare response columns
        response_cols = ['Key', 'Type', 'Authors', 'Title', 'Journal/Booktitle', 'Year', final_bibtex_col]
        if latex_analyzed:
            response_cols.insert(6, 'Frequency')
            response_cols.insert(7, 'Sections')
        response_df = df[[col for col in response_cols if col in df.columns]].copy()
        response_df.columns = list(response_df.columns[:-1]) + ['Final_BibTeX']
        return jsonify({
            'success': True,
            'count': len(df),
            'db_id': db_id,
            'latex_analyzed': latex_analyzed,
            'citations_found': citations_found,
            'data': response_df.to_dict(orient='records'),
            'full_data': df.to_dict(orient='records')
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
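# Example JSON payload for the /process route above (path assumed):
#   {"bibtex_content": "@article{smith2020, ...}", "input_mode": "bibtex",
#    "enrich": true, "abbreviate": true, "protect": true, "save_to_db": false}
# Multipart form posts may additionally attach a .tex manuscript as "latex_file".
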
@app.route('/sections')  # path assumed
def list_sections():
    """Get list of all unique sections from database"""
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        cursor.execute("SELECT DISTINCT sections FROM bibliography WHERE sections IS NOT NULL AND sections != ''")
        rows = cursor.fetchall()
        conn.close()
        # Parse and deduplicate sections
        all_sections = set()
        for row in rows:
            if row[0]:
                sections = [s.strip() for s in row[0].split(',')]
                all_sections.update(sections)
        sections_list = sorted(all_sections)
        return jsonify({
            'success': True,
            'sections': sections_list
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/references_by_section')  # path assumed
def get_references_by_section():
    """Get all references for a specific section"""
    section = request.args.get('section', '')
    if not section:
        return jsonify({'error': 'Section parameter required'}), 400
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        # Find all references whose sections field contains this section
        cursor.execute('''
            SELECT key, title, authors, year, frequency, sections, reference
            FROM bibliography
            WHERE sections LIKE ?
        ''', (f'%{section}%',))
        rows = cursor.fetchall()
        conn.close()
        references = []
        for row in rows:
            # Confirm exact section membership (LIKE also matches substrings)
            sections_list = [s.strip() for s in row[5].split(',') if s.strip()]
            if section in sections_list:
                # Estimate per-section frequency (simplified - assumes equal distribution)
                total_freq = row[4] or 0
                num_sections = len(sections_list)
                freq_in_section = total_freq // num_sections if num_sections > 0 else total_freq
                references.append({
                    'key': row[0],
                    'title': row[1],
                    'authors': row[2],
                    'year': row[3],
                    'frequency_in_section': freq_in_section,
                    'total_frequency': total_freq,
                    'all_sections': row[5]
                })
        return jsonify({
            'success': True,
            'section': section,
            'references': references
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/database')  # path assumed
def get_database_entries():
    """Get the most recent entries from the database (capped at 100)"""
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        cursor.execute('''
            SELECT * FROM bibliography ORDER BY created_at DESC LIMIT 100
        ''')
        columns = [description[0] for description in cursor.description]
        entries = [dict(zip(columns, row)) for row in cursor.fetchall()]
        conn.close()
        return jsonify({
            'success': True,
            'count': len(entries),
            'entries': entries
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/entry/<key>', methods=['DELETE'])  # path assumed
def delete_entry(key):
    """Delete entry from database"""
    if not check_api_key():
        return jsonify({'error': 'Unauthorized'}), 401
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        cursor.execute('DELETE FROM bibliography WHERE key=?', (key,))
        conn.commit()
        conn.close()
        return jsonify({'success': True})
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/clear_database', methods=['POST'])  # path assumed
def clear_database():
    """Clear all entries from database"""
    if not check_api_key():
        return jsonify({'error': 'Unauthorized'}), 401
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        cursor.execute('DELETE FROM bibliography')
        deleted_count = cursor.rowcount
        # Reset the AUTOINCREMENT counter (single quotes: SQL string literal, not identifier)
        cursor.execute("DELETE FROM sqlite_sequence WHERE name='bibliography'")
        conn.commit()
        conn.close()
        print(f"Cleared {deleted_count} entries from database")
        return jsonify({
            'success': True,
            'deleted_count': deleted_count,
            'message': f'Successfully cleared {deleted_count} entries from database'
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
@app.route('/export/csv')  # path assumed
def export_database():
    """Export database as CSV"""
    try:
        conn = get_db_connection()
        df = pd.read_sql_query('SELECT * FROM bibliography', conn)
        conn.close()
        output = io.StringIO()
        df.to_csv(output, index=False)
        output.seek(0)
        return send_file(
            io.BytesIO(output.getvalue().encode()),
            mimetype='text/csv',
            as_attachment=True,
            download_name='references_export.csv'
        )
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/export/bibtex')  # path assumed
def export_bibtex():
    """Export database as BibTeX"""
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        cursor.execute('SELECT key, crossref_bibtex_protected FROM bibliography ORDER BY key')
        bibtex_content = '\n\n'.join([row[1] for row in cursor.fetchall() if row[1]])
        conn.close()
        return send_file(
            io.BytesIO(bibtex_content.encode()),
            mimetype='text/plain',
            as_attachment=True,
            download_name='references.bib'
        )
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/download_db')  # path assumed
def download_database():
    """Download entire database file"""
    try:
        return send_file(
            app.config['DATABASE'],
            mimetype='application/x-sqlite3',
            as_attachment=True,
            download_name='refs_management.db'
        )
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/stats')  # path assumed
def get_stats():
    """Get database statistics"""
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        cursor.execute('SELECT COUNT(*) FROM bibliography')
        total = cursor.fetchone()[0]
        cursor.execute('SELECT COUNT(DISTINCT type) FROM bibliography')
        types = cursor.fetchone()[0]
        cursor.execute('SELECT COUNT(DISTINCT year_int) FROM bibliography WHERE year_int IS NOT NULL')
        years = cursor.fetchone()[0]
        conn.close()
        return jsonify({
            'total_entries': total,
            'entry_types': types,
            'unique_years': years
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
    init_db()
    app.run(debug=False, host='0.0.0.0', port=7860)
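
# Quick smoke test against the assumed routes (paths are editorial guesses):
#   curl http://localhost:7860/stats
#   curl -X POST http://localhost:7860/process -H 'Content-Type: application/json' \
#        -d '{"bibtex_content": "@article{k2020, title={T}, year={2020}}"}'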