Spaces:

Gurusha
/

harc

Runtime error

File size: 13,262 Bytes

94eec12

"""HaRC - Hallucinated Reference Checker (Hugging Face Spaces version)."""

import re
import tempfile
from pathlib import Path

import gradio as gr
import pymupdf  # PyMuPDF

from reference_checker import check_citations


def extract_references_section(text: str) -> str:
    """Return the part of ``text`` that follows the references heading.

    A heading is a line holding only one of the known section titles,
    matched case-insensitively and surrounded by optional whitespace. Titles
    are tried in priority order; for the first title that occurs at all,
    everything after its *last* occurrence is returned, so an earlier hit
    (for example a table-of-contents entry) does not pull in the paper body.

    Args:
        text: Full text of the paper.

    Returns:
        The text after the chosen heading, or the final 30% of ``text`` when
        no heading is found.
    """
    # One entry per title is enough: matching is case-insensitive, so
    # upper-case variants do not need their own pattern.
    headings = ('References', 'Bibliography', 'Works Cited', 'Literature Cited')

    for heading in headings:
        last_match = None
        for last_match in re.finditer(rf'\n\s*{heading}\s*\n', text, re.IGNORECASE):
            pass
        if last_match is not None:
            return text[last_match.end():]

    # If no header found, return last 30% of document (often contains refs)
    return text[int(len(text) * 0.7):]


def parse_references_from_text(text: str) -> list[dict]:
    """Split reference-section text into entries and parse each one.

    Three splitting heuristics are tried in turn:

    1. bracketed numbers (``[1]``, ``[2]``, ...), used when the split yields
       more than three pieces;
    2. numbers followed by a period at the start of a line (``1.``, ``2.``);
    3. when fewer than three entries were found, a split before text that
       looks like ``Surname, I.`` following sentence-ending punctuation.

    Args:
        text: Text of the references section, line breaks included.

    Returns:
        The parsed references that have a title. Only the first 100 split
        entries are parsed.
    """
    references = []

    # Single-line copy for the heuristics that do not depend on line starts.
    flat_text = re.sub(r'\s+', ' ', text)

    # Pattern 1: [1], [2], etc.
    numbered_refs = re.split(r'\[\d+\]\s*', flat_text)
    if len(numbered_refs) > 3:
        refs_list = [r.strip() for r in numbered_refs if r.strip()]
    else:
        # Pattern 2: 1. 2. 3. etc at start of line. This has to run on the
        # original text: once newlines are collapsed there are no line starts
        # left to anchor on. Limiting the marker to three digits keeps a year
        # that happens to begin a wrapped line ("2019. Title ...") from being
        # taken for a list number.
        line_refs = re.split(r'(?:^|\n)[ \t]*\d{1,3}\.\s+', text)
        refs_list = [re.sub(r'\s+', ' ', r).strip() for r in line_refs]
        refs_list = [r for r in refs_list if r]

    if len(refs_list) < 3:
        # Pattern 3: Split by author name patterns (Name, Initial.)
        refs_list = re.split(r'(?<=[.?!])\s+(?=[A-Z][a-z]+,?\s+[A-Z]\.)', flat_text)
        refs_list = [r.strip() for r in refs_list if r.strip() and len(r) > 30]

    for ref_text in refs_list[:100]:  # Limit to 100 refs
        ref = parse_single_reference(ref_text)
        if ref and ref.get('title'):
            references.append(ref)

    return references


def parse_single_reference(text: str) -> dict | None:
    """Parse a single reference string into structured data."""
    if len(text) < 20:
        return None

    ref = {}

    # Extract year (4 digits, typically 1900-2099)
    year_match = re.search(r'\b(19|20)\d{2}\b', text)
    if year_match:
        ref['year'] = year_match.group()

    # Extract DOI if present
    doi_match = re.search(r'10\.\d{4,}/[^\s]+', text)
    if doi_match:
        ref['doi'] = doi_match.group().rstrip('.')

    # Extract arXiv ID if present
    arxiv_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', text, re.IGNORECASE)
    if arxiv_match:
        ref['arxiv'] = arxiv_match.group(1)

    # Try to extract title (usually in quotes or after authors, before journal)
    # Pattern: Look for text in quotes
    title_match = re.search(r'["\u201c]([^"\u201d]+)["\u201d]', text)
    if title_match:
        ref['title'] = title_match.group(1).strip()
    else:
        # Heuristic: title is often after year and authors, before journal/venue
        # Take a reasonable chunk after the year
        if year_match:
            after_year = text[year_match.end():].strip()
            # Remove leading punctuation
            after_year = re.sub(r'^[.,)\]]\s*', '', after_year)
            # Take first sentence-like chunk
            title_candidate = re.split(r'[.!?]', after_year)[0].strip()
            if 10 < len(title_candidate) < 200:
                ref['title'] = title_candidate

    # If still no title, try beginning of text (before year)
    if not ref.get('title') and year_match:
        before_year = text[:year_match.start()].strip()
        # Look for the last comma-separated segment before year as potential title
        parts = before_year.rsplit('.', 1)
        if len(parts) > 1 and len(parts[-1].strip()) > 10:
            ref['title'] = parts[-1].strip()

    # Extract authors (usually at the beginning)
    if year_match:
        author_text = text[:year_match.start()].strip()
        # Clean up and extract author names
        author_text = re.sub(r'[,.]$', '', author_text)
        if author_text and len(author_text) < 500:
            # Split by 'and' or comma
            author_parts = re.split(r'\s+and\s+|,\s*', author_text)
            authors = []
            for part in author_parts:
                part = part.strip()
                # Filter out non-name parts
                if part and len(part) > 2 and not part.isdigit():
                    # Check if it looks like a name (has capital letter)
                    if re.search(r'[A-Z]', part):
                        authors.append(part)
            if authors:
                ref['authors'] = authors[:10]  # Limit to 10 authors

    return ref if ref.get('title') else None


def references_to_bibtex(references: list[dict]) -> str:
    """Render parsed references as ``@article`` BibTeX entries.

    Entries are keyed ``ref1``, ``ref2``, ... by their position in
    ``references``; a reference with no usable field is skipped but still
    consumes its number. Entries are separated by a blank line.
    """
    rendered = []

    for number, item in enumerate(references, start=1):
        lines = []

        raw_title = item.get('title')
        if raw_title:
            # Escape special characters
            safe_title = raw_title.replace('{', '\\{').replace('}', '\\}')
            lines.append(f'  title = {{{safe_title}}}')

        names = item.get('authors')
        if names:
            joined_names = ' and '.join(names)
            lines.append(f'  author = {{{joined_names}}}')

        for field in ('year', 'doi'):
            value = item.get(field)
            if value:
                lines.append(f'  {field} = {{{value}}}')

        arxiv_id = item.get('arxiv')
        if arxiv_id:
            lines.append(f'  eprint = {{{arxiv_id}}}')
            lines.append('  archiveprefix = {arXiv}')

        if not lines:
            continue

        header = f"@article{{ref{number},"
        body = ",\n".join(lines)
        rendered.append(f"{header}\n{body}\n}}")

    return "\n\n".join(rendered)


def process_pdf(pdf_file) -> tuple[str, str, str]:
    """Extract the references from an uploaded PDF and check them.

    Args:
        pdf_file: The upload handed over by the file component: either a
            filesystem path (``str`` / ``Path``) or an object exposing the
            path as ``.name``. ``None`` means nothing was uploaded.

    Returns:
        ``(summary, issues_text, verified_text)`` as Markdown strings. When
        processing cannot proceed, the summary carries the message and the
        other two strings are empty. Exceptions are reported the same way
        instead of being raised.
    """
    if pdf_file is None:
        return "Please upload a PDF file.", "", ""

    try:
        # Accept a plain path as well as a file-like wrapper; a bare
        # ``pdf_file.name`` fails on ``str`` and is only the last path
        # component on ``Path``.
        if isinstance(pdf_file, (str, Path)):
            pdf_path = str(pdf_file)
        else:
            pdf_path = pdf_file.name

        # Extract text from PDF; the context manager closes the document
        # even when a page fails to extract.
        with pymupdf.open(pdf_path) as doc:
            full_text = "".join(page.get_text() for page in doc)

        if not full_text.strip():
            return "Could not extract text from PDF. The file might be scanned/image-based.", "", ""

        # Extract references section
        refs_text = extract_references_section(full_text)

        # Parse references
        references = parse_references_from_text(refs_text)

        if not references:
            return "No references could be extracted from the PDF.", "", ""

        # Convert to BibTeX
        bibtex = references_to_bibtex(references)

        # Save to temp file and check
        with tempfile.NamedTemporaryFile(mode='w', suffix='.bib', delete=False) as f:
            f.write(bibtex)
            bib_path = f.name

        try:
            issues = check_citations(bib_path, verbose=False)
            # First reported issue per entry key, for O(1) lookup below.
            first_issue = {}
            for result in issues:
                first_issue.setdefault(result.entry.key, result)
        finally:
            Path(bib_path).unlink(missing_ok=True)

        # Build results
        verified = []
        problems = []

        # Keys mirror the ``ref<N>`` numbering used when the BibTeX was built.
        for i, ref in enumerate(references, start=1):
            key = f"ref{i}"
            title = ref.get('title', 'Unknown')
            author_list = ref.get('authors', [])
            authors = ', '.join(author_list[:3])
            if len(author_list) > 3:
                authors += ' et al.'
            year = ref.get('year', '')

            if key in first_issue:
                issue = first_issue[key]
                problems.append(f"**{title}**\n  {authors} ({year})\n  *Issue: {issue.message}*")
            else:
                verified.append(f"**{title}**\n  {authors} ({year})")

        # Summary
        total = len(references)
        verified_count = len(verified)
        issues_count = len(problems)

        summary = "## Results\n\n"
        summary += f"- **Total references found:** {total}\n"
        summary += f"- **Verified:** {verified_count}\n"
        summary += f"- **Issues found:** {issues_count}\n"

        if issues_count == 0:
            summary += "\n All references verified successfully!"
        elif issues_count > total * 0.5:
            summary += "\n Many issues found - some may be due to parsing errors."

        issues_text = "\n\n".join(problems) if problems else "No issues found!"
        verified_text = "\n\n".join(verified) if verified else "No verified references."

        return summary, issues_text, verified_text

    except Exception as e:
        # UI boundary: show the failure in the summary pane instead of raising.
        return f"Error processing PDF: {str(e)}", "", ""


def process_bibtex(bibtex_text: str) -> tuple[str, str, str]:
    """Check the references in pasted BibTeX content.

    Args:
        bibtex_text: Contents of a ``.bib`` file. ``None``, empty and
            whitespace-only input are all treated as "nothing pasted".

    Returns:
        ``(summary, issues_text, verified_text)`` as Markdown strings. When
        there is no input or processing raises, the summary carries the
        message and the other two strings are empty.
    """
    # Guard against None as well as blank text so the check itself cannot raise.
    if not bibtex_text or not bibtex_text.strip():
        return "Please paste your BibTeX content.", "", ""

    try:
        # Save to temp file
        with tempfile.NamedTemporaryFile(mode='w', suffix='.bib', delete=False) as f:
            f.write(bibtex_text)
            bib_path = f.name

        try:
            from reference_checker.parser import parse_bib_file
            entries = parse_bib_file(bib_path)
            issues = check_citations(bib_path, verbose=False)
            # First reported issue per entry key, for O(1) lookup below.
            first_issue = {}
            for result in issues:
                first_issue.setdefault(result.entry.key, result)
        finally:
            Path(bib_path).unlink(missing_ok=True)

        # Build results
        verified = []
        problems = []

        for entry in entries:
            authors = ', '.join(entry.authors[:3])
            if len(entry.authors) > 3:
                authors += ' et al.'

            if entry.key in first_issue:
                issue = first_issue[entry.key]
                problems.append(f"**[{entry.key}] {entry.title}**\n  {authors} ({entry.year})\n  *Issue: {issue.message}*")
            else:
                verified.append(f"**[{entry.key}] {entry.title}**\n  {authors} ({entry.year})")

        # Summary
        total = len(entries)
        verified_count = len(verified)
        issues_count = len(problems)

        summary = "## Results\n\n"
        summary += f"- **Total entries:** {total}\n"
        summary += f"- **Verified:** {verified_count}\n"
        summary += f"- **Issues found:** {issues_count}\n"

        if issues_count == 0:
            summary += "\n All references verified successfully!"

        issues_text = "\n\n".join(problems) if problems else "No issues found!"
        verified_text = "\n\n".join(verified) if verified else "No verified references."

        return summary, issues_text, verified_text

    except Exception as e:
        # UI boundary: show the failure in the summary pane instead of raising.
        return f"Error processing BibTeX: {str(e)}", "", ""


# Build Gradio interface
# The layout is assembled at import time: a header, two tabs (PDF upload and
# pasted BibTeX) that each show a summary plus side-by-side issue/verified
# panes, and a footer note. Components appear in the order they are created.
with gr.Blocks(
    title="HaRC - Hallucinated Reference Checker",
    theme=gr.themes.Soft(primary_hue="purple"),
) as demo:
    # Page header.
    gr.Markdown("""
    # HaRC - Hallucinated Reference Checker

    Verify your paper's references against academic databases.
    Catches fake, misspelled, or incorrect citations before submission.

    **Checks against:** Semantic Scholar, DBLP, Google Scholar, Open Library
    """)

    with gr.Tabs():
        # Tab 1: upload a PDF; handled by process_pdf.
        with gr.TabItem("Upload PDF"):
            gr.Markdown("Upload your paper and we'll extract and verify the references.")
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            pdf_button = gr.Button("Check References", variant="primary")

            # Summary spans the full width; the two result panes sit side by side.
            with gr.Row():
                pdf_summary = gr.Markdown(label="Summary")

            with gr.Row():
                with gr.Column():
                    pdf_issues = gr.Markdown(label="Issues Found")
                with gr.Column():
                    pdf_verified = gr.Markdown(label="Verified References")

            # The three values returned by process_pdf fill the three panes in order.
            pdf_button.click(
                fn=process_pdf,
                inputs=[pdf_input],
                outputs=[pdf_summary, pdf_issues, pdf_verified],
            )

        # Tab 2: paste BibTeX text; handled by process_bibtex.
        with gr.TabItem("Paste BibTeX"):
            gr.Markdown("Paste your `.bib` file contents directly.")
            bib_input = gr.Textbox(
                label="BibTeX Content",
                placeholder="@article{example2023,\n  title = {Example Paper},\n  author = {John Doe},\n  year = {2023}\n}",
                lines=10,
            )
            bib_button = gr.Button("Check References", variant="primary")

            # Same pane arrangement as the PDF tab.
            with gr.Row():
                bib_summary = gr.Markdown(label="Summary")

            with gr.Row():
                with gr.Column():
                    bib_issues = gr.Markdown(label="Issues Found")
                with gr.Column():
                    bib_verified = gr.Markdown(label="Verified References")

            # The three values returned by process_bibtex fill the three panes in order.
            bib_button.click(
                fn=process_bibtex,
                inputs=[bib_input],
                outputs=[bib_summary, bib_issues, bib_verified],
            )

    # Footer note, created outside the tabs.
    gr.Markdown("""
    ---
    **Note:** PDF reference extraction uses heuristics and may not be 100% accurate.
    For best results, use the BibTeX tab with your actual `.bib` file.

    [GitHub](https://github.com/gurusha01/HaRC) | [PyPI](https://pypi.org/project/harcx/)
    """)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()