|
|
"""HaRC - Hallucinated Reference Checker (Hugging Face Spaces version).""" |
|
|
|
|
|
import re |
|
|
import tempfile |
|
|
from pathlib import Path |
|
|
|
|
|
import gradio as gr |
|
|
import pymupdf |
|
|
|
|
|
from reference_checker import check_citations |
|
|
|
|
|
|
|
|
def extract_references_section(text: str) -> str:
    """Extract the references/bibliography section from paper text.

    Searches for a standard section heading and returns everything after
    the LAST occurrence — an earlier match may be a table-of-contents
    entry or an in-text mention of the word.  Falls back to the final
    30% of the document when no heading is found.

    Args:
        text: Full plain text of the paper.

    Returns:
        The text following the references heading (or the document tail).
    """
    # All searches run with re.IGNORECASE, so one pattern per heading is
    # enough (the previous list also carried redundant all-caps variants).
    patterns = [
        r'\n\s*References\s*\n',
        r'\n\s*Bibliography\s*\n',
        r'\n\s*Works Cited\s*\n',
        r'\n\s*Literature Cited\s*\n',
    ]

    for pattern in patterns:
        # Take the last heading match: the real references section is by
        # convention the final one in the document.
        matches = list(re.finditer(pattern, text, re.IGNORECASE))
        if matches:
            return text[matches[-1].end():]

    # No heading found — heuristically assume references live near the end.
    return text[int(len(text) * 0.7):]
|
|
|
|
|
|
|
|
def parse_references_from_text(text: str) -> list[dict]:
    """Parse individual references from extracted text.

    Uses heuristics to identify reference boundaries and extract metadata.
    Tries, in order: "[1]"-style markers, "1."-style markers at line
    starts, then sentence-end/author-name boundaries.  At most 100
    references are parsed.

    Args:
        text: Raw text of the references section (newlines intact).

    Returns:
        A list of reference dicts produced by parse_single_reference,
        keeping only entries that yielded a title.
    """
    references: list[dict] = []

    # Whitespace-collapsed copy for splitting styles that don't rely on
    # line structure.
    flat = re.sub(r'\s+', ' ', text)

    # Try numbered references like [1], [2], ...
    bracket_parts = re.split(r'\[\d+\]\s*', flat)
    if len(bracket_parts) > 3:
        refs_list = [r.strip() for r in bracket_parts if r.strip()]
    else:
        # Try "1." style markers anchored at line starts.  This split must
        # run on the ORIGINAL text: previously it ran after whitespace
        # collapsing, which removed every newline and made the pattern
        # unmatchable past the first character (dead branch).
        dotted_parts = re.split(r'(?:^|\n)\d+\.\s+', text)
        refs_list = [re.sub(r'\s+', ' ', r).strip() for r in dotted_parts]
        refs_list = [r for r in refs_list if r]

    if len(refs_list) < 3:
        # Last resort: split at sentence ends followed by a "Surname, I."
        # author-name pattern; drop fragments too short to be references.
        refs_list = re.split(r'(?<=[.?!])\s+(?=[A-Z][a-z]+,?\s+[A-Z]\.)', flat)
        refs_list = [r.strip() for r in refs_list if r.strip() and len(r) > 30]

    # Cap at 100 entries to bound downstream checking work.
    for ref_text in refs_list[:100]:
        ref = parse_single_reference(ref_text)
        if ref and ref.get('title'):
            references.append(ref)

    return references
|
|
|
|
|
|
|
|
def parse_single_reference(text: str) -> dict | None: |
|
|
"""Parse a single reference string into structured data.""" |
|
|
if len(text) < 20: |
|
|
return None |
|
|
|
|
|
ref = {} |
|
|
|
|
|
|
|
|
year_match = re.search(r'\b(19|20)\d{2}\b', text) |
|
|
if year_match: |
|
|
ref['year'] = year_match.group() |
|
|
|
|
|
|
|
|
doi_match = re.search(r'10\.\d{4,}/[^\s]+', text) |
|
|
if doi_match: |
|
|
ref['doi'] = doi_match.group().rstrip('.') |
|
|
|
|
|
|
|
|
arxiv_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', text, re.IGNORECASE) |
|
|
if arxiv_match: |
|
|
ref['arxiv'] = arxiv_match.group(1) |
|
|
|
|
|
|
|
|
|
|
|
title_match = re.search(r'["\u201c]([^"\u201d]+)["\u201d]', text) |
|
|
if title_match: |
|
|
ref['title'] = title_match.group(1).strip() |
|
|
else: |
|
|
|
|
|
|
|
|
if year_match: |
|
|
after_year = text[year_match.end():].strip() |
|
|
|
|
|
after_year = re.sub(r'^[.,)\]]\s*', '', after_year) |
|
|
|
|
|
title_candidate = re.split(r'[.!?]', after_year)[0].strip() |
|
|
if 10 < len(title_candidate) < 200: |
|
|
ref['title'] = title_candidate |
|
|
|
|
|
|
|
|
if not ref.get('title') and year_match: |
|
|
before_year = text[:year_match.start()].strip() |
|
|
|
|
|
parts = before_year.rsplit('.', 1) |
|
|
if len(parts) > 1 and len(parts[-1].strip()) > 10: |
|
|
ref['title'] = parts[-1].strip() |
|
|
|
|
|
|
|
|
if year_match: |
|
|
author_text = text[:year_match.start()].strip() |
|
|
|
|
|
author_text = re.sub(r'[,.]$', '', author_text) |
|
|
if author_text and len(author_text) < 500: |
|
|
|
|
|
author_parts = re.split(r'\s+and\s+|,\s*', author_text) |
|
|
authors = [] |
|
|
for part in author_parts: |
|
|
part = part.strip() |
|
|
|
|
|
if part and len(part) > 2 and not part.isdigit(): |
|
|
|
|
|
if re.search(r'[A-Z]', part): |
|
|
authors.append(part) |
|
|
if authors: |
|
|
ref['authors'] = authors[:10] |
|
|
|
|
|
return ref if ref.get('title') else None |
|
|
|
|
|
|
|
|
def references_to_bibtex(references: list[dict]) -> str:
    """Render parsed reference dicts as a BibTeX string.

    Each reference becomes an @article entry keyed ref1, ref2, ...;
    references that yield no usable fields are skipped entirely.

    Args:
        references: Dicts with optional 'title', 'authors', 'year',
            'doi', and 'arxiv' keys.

    Returns:
        BibTeX entries separated by blank lines (empty string if none).
    """
    entries = []

    for idx, ref in enumerate(references, start=1):
        fields = []

        title = ref.get('title')
        if title:
            # Escape literal braces so they survive BibTeX parsing.
            safe_title = title.replace('{', '\\{').replace('}', '\\}')
            fields.append(f' title = {{{safe_title}}}')

        authors = ref.get('authors')
        if authors:
            fields.append(f" author = {{{' and '.join(authors)}}}")

        year = ref.get('year')
        if year:
            fields.append(f' year = {{{year}}}')

        doi = ref.get('doi')
        if doi:
            fields.append(f' doi = {{{doi}}}')

        arxiv_id = ref.get('arxiv')
        if arxiv_id:
            fields.append(f' eprint = {{{arxiv_id}}}')
            fields.append(' archiveprefix = {arXiv}')

        if not fields:
            continue

        body = ",\n".join(fields)
        entries.append(f"@article{{ref{idx},\n{body}\n}}")

    return "\n\n".join(entries)
|
|
|
|
|
|
|
|
def process_pdf(pdf_file) -> tuple[str, str, str]:
    """Process an uploaded PDF and check its references.

    Extracts the paper text, locates and parses the references section,
    serializes the parsed references to a temporary .bib file, and runs
    the citation checker on it.

    Args:
        pdf_file: Gradio file wrapper for the uploaded PDF (exposes a
            ``.name`` path attribute), or None when nothing was uploaded.

    Returns:
        (summary, issues_text, verified_text) as Markdown strings.
    """
    if pdf_file is None:
        return "Please upload a PDF file.", "", ""

    try:
        # Context manager guarantees the document is closed even if a page
        # raises during text extraction (previously doc.close() was
        # skipped on error, leaking the handle).
        with pymupdf.open(pdf_file.name) as doc:
            full_text = "".join(page.get_text() for page in doc)

        if not full_text.strip():
            return "Could not extract text from PDF. The file might be scanned/image-based.", "", ""

        # Find the references section, then parse individual references.
        refs_text = extract_references_section(full_text)
        references = parse_references_from_text(refs_text)

        if not references:
            return "No references could be extracted from the PDF.", "", ""

        # The checker consumes .bib files, so round-trip through a temp file.
        bibtex = references_to_bibtex(references)
        with tempfile.NamedTemporaryFile(mode='w', suffix='.bib', delete=False) as f:
            f.write(bibtex)
            bib_path = f.name

        try:
            issues = check_citations(bib_path, verbose=False)
            # Map key -> first reported issue: O(total) overall instead of
            # re-scanning `issues` with next(...) for every flagged entry.
            issue_by_key = {}
            for report in issues:
                issue_by_key.setdefault(report.entry.key, report)
        finally:
            # Always remove the temp file, even if the checker raises.
            Path(bib_path).unlink(missing_ok=True)

        # Partition references into verified / problematic for display.
        verified = []
        problems = []

        for i, ref in enumerate(references):
            key = f"ref{i+1}"  # must match the key scheme in references_to_bibtex
            title = ref.get('title', 'Unknown')
            authors = ', '.join(ref.get('authors', [])[:3])
            if len(ref.get('authors', [])) > 3:
                authors += ' et al.'
            year = ref.get('year', '')

            issue = issue_by_key.get(key)
            if issue is not None:
                problems.append(f"**{title}**\n {authors} ({year})\n *Issue: {issue.message}*")
            else:
                verified.append(f"**{title}**\n {authors} ({year})")

        total = len(references)
        verified_count = len(verified)
        issues_count = len(problems)

        summary = "## Results\n\n"
        summary += f"- **Total references found:** {total}\n"
        summary += f"- **Verified:** {verified_count}\n"
        summary += f"- **Issues found:** {issues_count}\n"

        if issues_count == 0:
            summary += "\n All references verified successfully!"
        elif issues_count > total * 0.5:
            # Many failures usually means the heuristic parser mangled the
            # references rather than the paper being full of fakes.
            summary += "\n Many issues found - some may be due to parsing errors."

        issues_text = "\n\n".join(problems) if problems else "No issues found!"
        verified_text = "\n\n".join(verified) if verified else "No verified references."

        return summary, issues_text, verified_text

    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing
        # the Space.
        return f"Error processing PDF: {str(e)}", "", ""
|
|
|
|
|
|
|
|
def process_bibtex(bibtex_text: str) -> tuple[str, str, str]:
    """Process pasted BibTeX and check references.

    Args:
        bibtex_text: Raw contents of a ``.bib`` file.

    Returns:
        (summary, issues_text, verified_text) as Markdown strings.
    """
    if not bibtex_text.strip():
        return "Please paste your BibTeX content.", "", ""

    try:
        # The checker works on files, so round-trip through a temp .bib.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.bib', delete=False) as f:
            f.write(bibtex_text)
            bib_path = f.name

        try:
            from reference_checker.parser import parse_bib_file
            entries = parse_bib_file(bib_path)
            issues = check_citations(bib_path, verbose=False)
            # Map key -> first reported issue: O(total) overall instead of
            # re-scanning `issues` with next(...) for every flagged entry.
            issue_by_key = {}
            for report in issues:
                issue_by_key.setdefault(report.entry.key, report)
        finally:
            # Always remove the temp file, even if parsing/checking raises.
            Path(bib_path).unlink(missing_ok=True)

        # Partition entries into verified / problematic for display.
        verified = []
        problems = []

        for entry in entries:
            authors = ', '.join(entry.authors[:3])
            if len(entry.authors) > 3:
                authors += ' et al.'

            issue = issue_by_key.get(entry.key)
            if issue is not None:
                problems.append(f"**[{entry.key}] {entry.title}**\n {authors} ({entry.year})\n *Issue: {issue.message}*")
            else:
                verified.append(f"**[{entry.key}] {entry.title}**\n {authors} ({entry.year})")

        total = len(entries)
        verified_count = len(verified)
        issues_count = len(problems)

        summary = "## Results\n\n"
        summary += f"- **Total entries:** {total}\n"
        summary += f"- **Verified:** {verified_count}\n"
        summary += f"- **Issues found:** {issues_count}\n"

        if issues_count == 0:
            summary += "\n All references verified successfully!"

        issues_text = "\n\n".join(problems) if problems else "No issues found!"
        verified_text = "\n\n".join(verified) if verified else "No verified references."

        return summary, issues_text, verified_text

    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing
        # the Space.
        return f"Error processing BibTeX: {str(e)}", "", ""
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: two tabs sharing the same layout — a summary row on top, then
# side-by-side "issues" and "verified" columns, each tab wired to its handler.
# ---------------------------------------------------------------------------
with gr.Blocks(
    title="HaRC - Hallucinated Reference Checker",
    theme=gr.themes.Soft(primary_hue="purple"),
) as demo:
    # Header / intro copy.
    gr.Markdown("""
    # HaRC - Hallucinated Reference Checker

    Verify your paper's references against academic databases.
    Catches fake, misspelled, or incorrect citations before submission.

    **Checks against:** Semantic Scholar, DBLP, Google Scholar, Open Library
    """)

    with gr.Tabs():
        # Tab 1: upload a PDF; references are extracted heuristically.
        with gr.TabItem("Upload PDF"):
            gr.Markdown("Upload your paper and we'll extract and verify the references.")
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            pdf_button = gr.Button("Check References", variant="primary")

            with gr.Row():
                pdf_summary = gr.Markdown(label="Summary")

            with gr.Row():
                with gr.Column():
                    pdf_issues = gr.Markdown(label="Issues Found")
                with gr.Column():
                    pdf_verified = gr.Markdown(label="Verified References")

            # Wire the button to the PDF handler.
            pdf_button.click(
                fn=process_pdf,
                inputs=[pdf_input],
                outputs=[pdf_summary, pdf_issues, pdf_verified],
            )

        # Tab 2: paste raw BibTeX (bypasses PDF-extraction heuristics).
        with gr.TabItem("Paste BibTeX"):
            gr.Markdown("Paste your `.bib` file contents directly.")
            bib_input = gr.Textbox(
                label="BibTeX Content",
                placeholder="@article{example2023,\n title = {Example Paper},\n author = {John Doe},\n year = {2023}\n}",
                lines=10,
            )
            bib_button = gr.Button("Check References", variant="primary")

            with gr.Row():
                bib_summary = gr.Markdown(label="Summary")

            with gr.Row():
                with gr.Column():
                    bib_issues = gr.Markdown(label="Issues Found")
                with gr.Column():
                    bib_verified = gr.Markdown(label="Verified References")

            # Wire the button to the BibTeX handler.
            bib_button.click(
                fn=process_bibtex,
                inputs=[bib_input],
                outputs=[bib_summary, bib_issues, bib_verified],
            )

    # Footer: caveats and project links.
    gr.Markdown("""
    ---
    **Note:** PDF reference extraction uses heuristics and may not be 100% accurate.
    For best results, use the BibTeX tab with your actual `.bib` file.

    [GitHub](https://github.com/gurusha01/HaRC) | [PyPI](https://pypi.org/project/harcx/)
    """)
|
|
|
|
|
|
|
|
# Launch the Gradio app when executed directly (module docstring indicates
# this file is the Hugging Face Spaces entry point).
if __name__ == "__main__":
    demo.launch()
|
|
|