"""HaRC - Hallucinated Reference Checker (Hugging Face Spaces version).""" import re import tempfile from pathlib import Path import gradio as gr import pymupdf # PyMuPDF from reference_checker import check_citations def extract_references_section(text: str) -> str: """Extract the references/bibliography section from paper text.""" # Common section headers for references patterns = [ r'\n\s*References\s*\n', r'\n\s*REFERENCES\s*\n', r'\n\s*Bibliography\s*\n', r'\n\s*BIBLIOGRAPHY\s*\n', r'\n\s*Works Cited\s*\n', r'\n\s*Literature Cited\s*\n', ] for pattern in patterns: match = re.search(pattern, text, re.IGNORECASE) if match: return text[match.end():] # If no header found, return last 30% of document (often contains refs) return text[int(len(text) * 0.7):] def parse_references_from_text(text: str) -> list[dict]: """Parse individual references from extracted text. Uses heuristics to identify reference boundaries and extract metadata. """ references = [] # Clean up text text = re.sub(r'\s+', ' ', text) # Try to split by common reference patterns # Pattern 1: [1], [2], etc. numbered_refs = re.split(r'\[\d+\]\s*', text) if len(numbered_refs) > 3: refs_list = [r.strip() for r in numbered_refs if r.strip()] else: # Pattern 2: 1. 2. 3. etc at start of line refs_list = re.split(r'(?:^|\n)\d+\.\s+', text) refs_list = [r.strip() for r in refs_list if r.strip()] if len(refs_list) < 3: # Pattern 3: Split by author name patterns (Name, Initial.) refs_list = re.split(r'(?<=[.?!])\s+(?=[A-Z][a-z]+,?\s+[A-Z]\.)', text) refs_list = [r.strip() for r in refs_list if r.strip() and len(r) > 30] for ref_text in refs_list[:100]: # Limit to 100 refs ref = parse_single_reference(ref_text) if ref and ref.get('title'): references.append(ref) return references def parse_single_reference(text: str) -> dict | None: """Parse a single reference string into structured data.""" if len(text) < 20: return None ref = {} # Extract year (4 digits, typically 1900-2099) year_match = re.search(r'\b(19|20)\d{2}\b', text) if year_match: ref['year'] = year_match.group() # Extract DOI if present doi_match = re.search(r'10\.\d{4,}/[^\s]+', text) if doi_match: ref['doi'] = doi_match.group().rstrip('.') # Extract arXiv ID if present arxiv_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', text, re.IGNORECASE) if arxiv_match: ref['arxiv'] = arxiv_match.group(1) # Try to extract title (usually in quotes or after authors, before journal) # Pattern: Look for text in quotes title_match = re.search(r'["\u201c]([^"\u201d]+)["\u201d]', text) if title_match: ref['title'] = title_match.group(1).strip() else: # Heuristic: title is often after year and authors, before journal/venue # Take a reasonable chunk after the year if year_match: after_year = text[year_match.end():].strip() # Remove leading punctuation after_year = re.sub(r'^[.,)\]]\s*', '', after_year) # Take first sentence-like chunk title_candidate = re.split(r'[.!?]', after_year)[0].strip() if 10 < len(title_candidate) < 200: ref['title'] = title_candidate # If still no title, try beginning of text (before year) if not ref.get('title') and year_match: before_year = text[:year_match.start()].strip() # Look for the last comma-separated segment before year as potential title parts = before_year.rsplit('.', 1) if len(parts) > 1 and len(parts[-1].strip()) > 10: ref['title'] = parts[-1].strip() # Extract authors (usually at the beginning) if year_match: author_text = text[:year_match.start()].strip() # Clean up and extract author names author_text = re.sub(r'[,.]$', '', author_text) if author_text and len(author_text) < 500: # Split by 'and' or comma author_parts = re.split(r'\s+and\s+|,\s*', author_text) authors = [] for part in author_parts: part = part.strip() # Filter out non-name parts if part and len(part) > 2 and not part.isdigit(): # Check if it looks like a name (has capital letter) if re.search(r'[A-Z]', part): authors.append(part) if authors: ref['authors'] = authors[:10] # Limit to 10 authors return ref if ref.get('title') else None def references_to_bibtex(references: list[dict]) -> str: """Convert references to BibTeX format.""" entries = [] for i, ref in enumerate(references): key = f"ref{i+1}" entry_type = "article" fields = [] if ref.get('title'): # Escape special characters title = ref['title'].replace('{', '\\{').replace('}', '\\}') fields.append(f' title = {{{title}}}') if ref.get('authors'): authors_str = ' and '.join(ref['authors']) fields.append(f' author = {{{authors_str}}}') if ref.get('year'): fields.append(f' year = {{{ref["year"]}}}') if ref.get('doi'): fields.append(f' doi = {{{ref["doi"]}}}') if ref.get('arxiv'): fields.append(f' eprint = {{{ref["arxiv"]}}}') fields.append(' archiveprefix = {arXiv}') if fields: entry = f"@{entry_type}{{{key},\n" entry += ",\n".join(fields) entry += "\n}" entries.append(entry) return "\n\n".join(entries) def process_pdf(pdf_file) -> tuple[str, str, str]: """Process uploaded PDF and check references. Returns: (summary, issues_text, verified_text) """ if pdf_file is None: return "Please upload a PDF file.", "", "" try: # Extract text from PDF doc = pymupdf.open(pdf_file.name) full_text = "" for page in doc: full_text += page.get_text() doc.close() if not full_text.strip(): return "Could not extract text from PDF. The file might be scanned/image-based.", "", "" # Extract references section refs_text = extract_references_section(full_text) # Parse references references = parse_references_from_text(refs_text) if not references: return "No references could be extracted from the PDF.", "", "" # Convert to BibTeX bibtex = references_to_bibtex(references) # Save to temp file and check with tempfile.NamedTemporaryFile(mode='w', suffix='.bib', delete=False) as f: f.write(bibtex) bib_path = f.name try: issues = check_citations(bib_path, verbose=False) issue_keys = {r.entry.key for r in issues} finally: Path(bib_path).unlink(missing_ok=True) # Build results verified = [] problems = [] for i, ref in enumerate(references): key = f"ref{i+1}" title = ref.get('title', 'Unknown') authors = ', '.join(ref.get('authors', [])[:3]) if len(ref.get('authors', [])) > 3: authors += ' et al.' year = ref.get('year', '') if key in issue_keys: issue = next(r for r in issues if r.entry.key == key) problems.append(f"**{title}**\n {authors} ({year})\n *Issue: {issue.message}*") else: verified.append(f"**{title}**\n {authors} ({year})") # Summary total = len(references) verified_count = len(verified) issues_count = len(problems) summary = f"## Results\n\n" summary += f"- **Total references found:** {total}\n" summary += f"- **Verified:** {verified_count}\n" summary += f"- **Issues found:** {issues_count}\n" if issues_count == 0: summary += "\n All references verified successfully!" elif issues_count > total * 0.5: summary += "\n Many issues found - some may be due to parsing errors." issues_text = "\n\n".join(problems) if problems else "No issues found!" verified_text = "\n\n".join(verified) if verified else "No verified references." return summary, issues_text, verified_text except Exception as e: return f"Error processing PDF: {str(e)}", "", "" def process_bibtex(bibtex_text: str) -> tuple[str, str, str]: """Process pasted BibTeX and check references.""" if not bibtex_text.strip(): return "Please paste your BibTeX content.", "", "" try: # Save to temp file with tempfile.NamedTemporaryFile(mode='w', suffix='.bib', delete=False) as f: f.write(bibtex_text) bib_path = f.name try: from reference_checker.parser import parse_bib_file entries = parse_bib_file(bib_path) issues = check_citations(bib_path, verbose=False) issue_keys = {r.entry.key for r in issues} finally: Path(bib_path).unlink(missing_ok=True) # Build results verified = [] problems = [] for entry in entries: authors = ', '.join(entry.authors[:3]) if len(entry.authors) > 3: authors += ' et al.' if entry.key in issue_keys: issue = next(r for r in issues if r.entry.key == entry.key) problems.append(f"**[{entry.key}] {entry.title}**\n {authors} ({entry.year})\n *Issue: {issue.message}*") else: verified.append(f"**[{entry.key}] {entry.title}**\n {authors} ({entry.year})") # Summary total = len(entries) verified_count = len(verified) issues_count = len(problems) summary = f"## Results\n\n" summary += f"- **Total entries:** {total}\n" summary += f"- **Verified:** {verified_count}\n" summary += f"- **Issues found:** {issues_count}\n" if issues_count == 0: summary += "\n All references verified successfully!" issues_text = "\n\n".join(problems) if problems else "No issues found!" verified_text = "\n\n".join(verified) if verified else "No verified references." return summary, issues_text, verified_text except Exception as e: return f"Error processing BibTeX: {str(e)}", "", "" # Build Gradio interface with gr.Blocks( title="HaRC - Hallucinated Reference Checker", theme=gr.themes.Soft(primary_hue="purple"), ) as demo: gr.Markdown(""" # HaRC - Hallucinated Reference Checker Verify your paper's references against academic databases. Catches fake, misspelled, or incorrect citations before submission. **Checks against:** Semantic Scholar, DBLP, Google Scholar, Open Library """) with gr.Tabs(): with gr.TabItem("Upload PDF"): gr.Markdown("Upload your paper and we'll extract and verify the references.") pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"]) pdf_button = gr.Button("Check References", variant="primary") with gr.Row(): pdf_summary = gr.Markdown(label="Summary") with gr.Row(): with gr.Column(): pdf_issues = gr.Markdown(label="Issues Found") with gr.Column(): pdf_verified = gr.Markdown(label="Verified References") pdf_button.click( fn=process_pdf, inputs=[pdf_input], outputs=[pdf_summary, pdf_issues, pdf_verified], ) with gr.TabItem("Paste BibTeX"): gr.Markdown("Paste your `.bib` file contents directly.") bib_input = gr.Textbox( label="BibTeX Content", placeholder="@article{example2023,\n title = {Example Paper},\n author = {John Doe},\n year = {2023}\n}", lines=10, ) bib_button = gr.Button("Check References", variant="primary") with gr.Row(): bib_summary = gr.Markdown(label="Summary") with gr.Row(): with gr.Column(): bib_issues = gr.Markdown(label="Issues Found") with gr.Column(): bib_verified = gr.Markdown(label="Verified References") bib_button.click( fn=process_bibtex, inputs=[bib_input], outputs=[bib_summary, bib_issues, bib_verified], ) gr.Markdown(""" --- **Note:** PDF reference extraction uses heuristics and may not be 100% accurate. For best results, use the BibTeX tab with your actual `.bib` file. [GitHub](https://github.com/gurusha01/HaRC) | [PyPI](https://pypi.org/project/harcx/) """) if __name__ == "__main__": demo.launch()