Spaces:

Gurusha
/

harc

Runtime error

App Files Files Community

harc / app.py

Gurusha

Initial deploy - HaRC reference checker

94eec12 11 days ago

raw

history blame contribute delete

13.3 kB

	"""HaRC - Hallucinated Reference Checker (Hugging Face Spaces version)."""

	import re
	import tempfile
	from pathlib import Path

	import gradio as gr
	import pymupdf # PyMuPDF

	from reference_checker import check_citations


	def extract_references_section(text: str) -> str:
	    """Extract the references/bibliography section from paper text.

	    The text is searched for a section header that sits on a line of its
	    own (surrounding whitespace is tolerated, matching is case-insensitive).
	    Header names are tried in priority order and, for the first name that
	    is found, everything after its first occurrence is returned.

	    Args:
	        text: Full plain text of the paper.

	    Returns:
	        The text following the header, or the last 30% of ``text`` when no
	        header is found.
	    """
	    # One entry per header name; re.IGNORECASE covers the upper-case variants.
	    header_names = (
	        r'References',
	        r'Bibliography',
	        r'Works\s+Cited',
	        r'Literature\s+Cited',
	    )

	    for name in header_names:
	        # ``\s*`` (not a single ``\s``) so that a header placed directly
	        # between two newlines -- the most common layout -- is recognised.
	        header = re.search(r'(?:\A|\n)\s*' + name + r'\s*\n', text, re.IGNORECASE)
	        if header:
	            return text[header.end():]

	    # If no header found, return last 30% of document (often contains refs)
	    return text[int(len(text) * 0.7):]


	def parse_references_from_text(text: str) -> list[dict]:
	    """Parse individual references from extracted text.

	    Three splitting heuristics are tried in turn:

	    1. bracketed markers such as ``[1]``, ``[2]``;
	    2. ``1.``, ``2.`` markers at the start of a line;
	    3. sentence boundaries followed by something shaped like
	       ``Surname, I.`` (used when the numbered splits yield fewer than
	       three chunks).

	    At most the first 100 chunks are handed to ``parse_single_reference``;
	    only results that carry a title are kept.

	    Args:
	        text: Text of the references section.

	    Returns:
	        A list of reference dicts as produced by ``parse_single_reference``.
	    """
	    def collapse(chunk: str) -> str:
	        # Join wrapped lines and squeeze runs of whitespace into one space.
	        return re.sub(r'\s+', ' ', chunk).strip()

	    # Pattern 1: [1], [2], etc.
	    pieces = re.split(r'\[\d+\]\s*', text)
	    if len(pieces) <= 3:
	        # Pattern 2: 1. 2. 3. etc at start of line. This has to run on the
	        # raw text: once whitespace is collapsed there are no line starts
	        # left to anchor on. One to three digits only, so that a wrapped
	        # line beginning with a year ("2019. ") is not taken for a marker.
	        pieces = re.split(r'(?:^|\n)[ \t]*\d{1,3}\.\s+', text)

	    refs_list = [flat for flat in map(collapse, pieces) if flat]

	    if len(refs_list) < 3:
	        # Pattern 3: Split by author name patterns (Name, Initial.)
	        pieces = re.split(
	            r'(?<=[.?!])\s+(?=[A-Z][a-z]+,?\s+[A-Z]\.)',
	            collapse(text),
	        )
	        refs_list = [p.strip() for p in pieces if p.strip() and len(p) > 30]

	    references = []
	    for ref_text in refs_list[:100]:  # Limit to 100 refs
	        ref = parse_single_reference(ref_text)
	        if ref and ref.get('title'):
	            references.append(ref)

	    return references


	def parse_single_reference(text: str) -> dict \| None:
	"""Parse a single reference string into structured data."""
	if len(text) < 20:
	return None

	ref = {}

	# Extract year (4 digits, typically 1900-2099)
	year_match = re.search(r'\b(19\|20)\d{2}\b', text)
	if year_match:
	ref['year'] = year_match.group()

	# Extract DOI if present
	doi_match = re.search(r'10\.\d{4,}/[^\s]+', text)
	if doi_match:
	ref['doi'] = doi_match.group().rstrip('.')

	# Extract arXiv ID if present
	arxiv_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', text, re.IGNORECASE)
	if arxiv_match:
	ref['arxiv'] = arxiv_match.group(1)

	# Try to extract title (usually in quotes or after authors, before journal)
	# Pattern: Look for text in quotes
	title_match = re.search(r'["\u201c]([^"\u201d]+)["\u201d]', text)
	if title_match:
	ref['title'] = title_match.group(1).strip()
	else:
	# Heuristic: title is often after year and authors, before journal/venue
	# Take a reasonable chunk after the year
	if year_match:
	after_year = text[year_match.end():].strip()
	# Remove leading punctuation
	after_year = re.sub(r'^[.,)\]]\s*', '', after_year)
	# Take first sentence-like chunk
	title_candidate = re.split(r'[.!?]', after_year)[0].strip()
	if 10 < len(title_candidate) < 200:
	ref['title'] = title_candidate

	# If still no title, try beginning of text (before year)
	if not ref.get('title') and year_match:
	before_year = text[:year_match.start()].strip()
	# Look for the last comma-separated segment before year as potential title
	parts = before_year.rsplit('.', 1)
	if len(parts) > 1 and len(parts[-1].strip()) > 10:
	ref['title'] = parts[-1].strip()

	# Extract authors (usually at the beginning)
	if year_match:
	author_text = text[:year_match.start()].strip()
	# Clean up and extract author names
	author_text = re.sub(r'[,.]$', '', author_text)
	if author_text and len(author_text) < 500:
	# Split by 'and' or comma
	author_parts = re.split(r'\s+and\s+\|,\s*', author_text)
	authors = []
	for part in author_parts:
	part = part.strip()
	# Filter out non-name parts
	if part and len(part) > 2 and not part.isdigit():
	# Check if it looks like a name (has capital letter)
	if re.search(r'[A-Z]', part):
	authors.append(part)
	if authors:
	ref['authors'] = authors[:10] # Limit to 10 authors

	return ref if ref.get('title') else None


	def references_to_bibtex(references: list[dict]) -> str:
	    """Render parsed references as a BibTeX document.

	    Each reference becomes an ``@article`` entry keyed ``ref<N>``, where N
	    is the 1-based position in ``references``. Only truthy fields are
	    written; a reference that yields no fields produces no entry. Entries
	    are separated by one blank line.
	    """
	    def field(name, value):
	        # One " name = {value}" line of an entry body.
	        return f' {name} = {{{value}}}'

	    rendered = []

	    for number, item in enumerate(references, start=1):
	        body = []

	        title = item.get('title')
	        if title:
	            # Braces in the title are backslash-escaped.
	            body.append(field('title', title.replace('{', '\\{').replace('}', '\\}')))

	        if item.get('authors'):
	            body.append(field('author', ' and '.join(item['authors'])))

	        for source_key, bib_name in (('year', 'year'), ('doi', 'doi')):
	            if item.get(source_key):
	                body.append(field(bib_name, item[source_key]))

	        if item.get('arxiv'):
	            body.append(field('eprint', item['arxiv']))
	            body.append(field('archiveprefix', 'arXiv'))

	        if not body:
	            continue

	        header = '@article{' + f'ref{number}' + ',\n'
	        rendered.append(header + ',\n'.join(body) + '\n}')

	    return '\n\n'.join(rendered)


	def process_pdf(pdf_file) -> tuple[str, str, str]:
	    """Process uploaded PDF and check references.

	    The PDF text is extracted, the references section is located and
	    parsed, the parsed references are written to a temporary ``.bib`` file
	    and that file is passed to ``check_citations``. The temporary file is
	    removed afterwards.

	    Args:
	        pdf_file: The upload handed over by the file component, or ``None``
	            when nothing was uploaded.

	    Returns:
	        (summary, issues_text, verified_text). When processing cannot
	        proceed, or any exception is raised, the first element carries the
	        message and the other two are empty strings.
	    """
	    if pdf_file is None:
	        return "Please upload a PDF file.", "", ""

	    try:
	        # NOTE(review): works for an upload object exposing ``.name`` and
	        # for a plain path string -- confirm which one the installed Gradio
	        # version passes.
	        pdf_path = getattr(pdf_file, "name", pdf_file)

	        # The context manager closes the document even if a page fails.
	        with pymupdf.open(pdf_path) as doc:
	            full_text = "".join(page.get_text() for page in doc)

	        if not full_text.strip():
	            return "Could not extract text from PDF. The file might be scanned/image-based.", "", ""

	        # Extract references section
	        refs_text = extract_references_section(full_text)

	        # Parse references
	        references = parse_references_from_text(refs_text)

	        if not references:
	            return "No references could be extracted from the PDF.", "", ""

	        # Convert to BibTeX
	        bibtex = references_to_bibtex(references)

	        # Save to temp file and check; the file is removed even when the
	        # write itself or the check fails.
	        bib_path = None
	        try:
	            with tempfile.NamedTemporaryFile(mode='w', suffix='.bib', delete=False) as handle:
	                bib_path = handle.name
	                handle.write(bibtex)
	            issues = check_citations(bib_path, verbose=False)
	        finally:
	            if bib_path is not None:
	                Path(bib_path).unlink(missing_ok=True)

	        # First reported issue per entry key, for constant-time lookup.
	        issue_by_key = {}
	        for result in issues:
	            issue_by_key.setdefault(result.entry.key, result)

	        # Build results
	        verified = []
	        problems = []

	        for number, ref in enumerate(references, start=1):
	            # Same key scheme as references_to_bibtex uses for the entries.
	            key = f"ref{number}"
	            title = ref.get('title', 'Unknown')
	            author_list = ref.get('authors', [])
	            authors = ', '.join(author_list[:3])
	            if len(author_list) > 3:
	                authors += ' et al.'
	            year = ref.get('year', '')

	            issue = issue_by_key.get(key)
	            if issue is not None:
	                problems.append(f"{title}\n {authors} ({year})\n Issue: {issue.message}")
	            else:
	                verified.append(f"{title}\n {authors} ({year})")

	        # Summary
	        total = len(references)
	        verified_count = len(verified)
	        issues_count = len(problems)

	        summary = "## Results\n\n"
	        summary += f"- Total references found: {total}\n"
	        summary += f"- Verified: {verified_count}\n"
	        summary += f"- Issues found: {issues_count}\n"

	        if issues_count == 0:
	            summary += "\n All references verified successfully!"
	        elif issues_count > total * 0.5:
	            summary += "\n Many issues found - some may be due to parsing errors."

	        issues_text = "\n\n".join(problems) if problems else "No issues found!"
	        verified_text = "\n\n".join(verified) if verified else "No verified references."

	        return summary, issues_text, verified_text

	    except Exception as e:
	        # UI boundary: report the failure in the summary pane instead of
	        # letting the exception propagate to the caller.
	        return f"Error processing PDF: {e}", "", ""


	def process_bibtex(bibtex_text: str) -> tuple[str, str, str]:
	    """Process pasted BibTeX and check references.

	    The text is written to a temporary ``.bib`` file, parsed with
	    ``parse_bib_file`` and checked with ``check_citations``; the temporary
	    file is removed afterwards.

	    Args:
	        bibtex_text: BibTeX source. ``None``, empty and whitespace-only
	            input are all treated as "nothing pasted".

	    Returns:
	        (summary, issues_text, verified_text). When there is no input, or
	        any exception is raised, the first element carries the message and
	        the other two are empty strings.
	    """
	    if not bibtex_text or not bibtex_text.strip():
	        return "Please paste your BibTeX content.", "", ""

	    try:
	        from reference_checker.parser import parse_bib_file

	        # Save to temp file; it is removed even when the write, the parse
	        # or the check fails.
	        bib_path = None
	        try:
	            with tempfile.NamedTemporaryFile(mode='w', suffix='.bib', delete=False) as handle:
	                bib_path = handle.name
	                handle.write(bibtex_text)
	            entries = parse_bib_file(bib_path)
	            issues = check_citations(bib_path, verbose=False)
	        finally:
	            if bib_path is not None:
	                Path(bib_path).unlink(missing_ok=True)

	        # First reported issue per entry key, for constant-time lookup.
	        issue_by_key = {}
	        for result in issues:
	            issue_by_key.setdefault(result.entry.key, result)

	        # Build results
	        verified = []
	        problems = []

	        for entry in entries:
	            authors = ', '.join(entry.authors[:3])
	            if len(entry.authors) > 3:
	                authors += ' et al.'

	            issue = issue_by_key.get(entry.key)
	            if issue is not None:
	                problems.append(f"[{entry.key}] {entry.title}\n {authors} ({entry.year})\n Issue: {issue.message}")
	            else:
	                verified.append(f"[{entry.key}] {entry.title}\n {authors} ({entry.year})")

	        # Summary
	        total = len(entries)
	        verified_count = len(verified)
	        issues_count = len(problems)

	        summary = "## Results\n\n"
	        summary += f"- Total entries: {total}\n"
	        summary += f"- Verified: {verified_count}\n"
	        summary += f"- Issues found: {issues_count}\n"

	        if issues_count == 0:
	            summary += "\n All references verified successfully!"

	        issues_text = "\n\n".join(problems) if problems else "No issues found!"
	        verified_text = "\n\n".join(verified) if verified else "No verified references."

	        return summary, issues_text, verified_text

	    except Exception as e:
	        # UI boundary: report the failure in the summary pane instead of
	        # letting the exception propagate to the caller.
	        return f"Error processing BibTeX: {e}", "", ""


	# Build Gradio interface
	def _build_result_panels():
	    """Create one tab's output area and return (summary, issues, verified).

	    Layout: a full-width summary row, then a row with two columns holding
	    the issues and the verified references.
	    """
	    with gr.Row():
	        summary = gr.Markdown(label="Summary")

	    with gr.Row():
	        with gr.Column():
	            issues = gr.Markdown(label="Issues Found")
	        with gr.Column():
	            verified = gr.Markdown(label="Verified References")

	    return summary, issues, verified


	with gr.Blocks(
	    title="HaRC - Hallucinated Reference Checker",
	    theme=gr.themes.Soft(primary_hue="purple"),
	) as demo:
	    gr.Markdown("""
	# HaRC - Hallucinated Reference Checker

	Verify your paper's references against academic databases.
	Catches fake, misspelled, or incorrect citations before submission.

	Checks against: Semantic Scholar, DBLP, Google Scholar, Open Library
	""")

	    with gr.Tabs():
	        # Tab 1: upload a paper; references are extracted heuristically.
	        with gr.TabItem("Upload PDF"):
	            gr.Markdown("Upload your paper and we'll extract and verify the references.")
	            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
	            pdf_button = gr.Button("Check References", variant="primary")

	            pdf_summary, pdf_issues, pdf_verified = _build_result_panels()

	            pdf_button.click(
	                fn=process_pdf,
	                inputs=[pdf_input],
	                outputs=[pdf_summary, pdf_issues, pdf_verified],
	            )

	        # Tab 2: paste BibTeX source directly.
	        with gr.TabItem("Paste BibTeX"):
	            gr.Markdown("Paste your `.bib` file contents directly.")
	            bib_input = gr.Textbox(
	                label="BibTeX Content",
	                placeholder="@article{example2023,\n title = {Example Paper},\n author = {John Doe},\n year = {2023}\n}",
	                lines=10,
	            )
	            bib_button = gr.Button("Check References", variant="primary")

	            bib_summary, bib_issues, bib_verified = _build_result_panels()

	            bib_button.click(
	                fn=process_bibtex,
	                inputs=[bib_input],
	                outputs=[bib_summary, bib_issues, bib_verified],
	            )

	    # A plain "|" separates the two links; "\|" inside a non-raw string
	    # literal is an invalid escape sequence.
	    gr.Markdown("""
	---
	Note: PDF reference extraction uses heuristics and may not be 100% accurate.
	For best results, use the BibTeX tab with your actual `.bib` file.

	[GitHub](https://github.com/gurusha01/HaRC) | [PyPI](https://pypi.org/project/harcx/)
	""")


	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()