Wenyu Zhang commited on
Commit ·
e83b370
1
Parent(s): e64e9e9
add application file
Browse files- Dockerfile +42 -0
- README.md +49 -5
- app.py +2075 -0
- requirements.txt +12 -0
- start.sh +16 -0
- venues.py +74 -0
Dockerfile
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Base image ships a ready-to-run GROBID 0.8.1 service.
FROM lfoppiano/grobid:0.8.1

USER root

# System Python 3 toolchain (pip + venv); trim apt caches in the same layer.
RUN apt-get update && \
    apt-get install -y python3 python3-pip python3-venv && \
    rm -rf /var/lib/apt/lists/*

# Ensure a UID-1000 account exists (Hugging Face Spaces convention).
RUN if ! id -u 1000 > /dev/null 2>&1; then \
        useradd -m -u 1000 user; \
    fi

WORKDIR /app

# Dependencies first so code changes don't invalidate the install layer.
COPY requirements.txt .

RUN pip3 install --upgrade pip
RUN pip3 install --no-cache-dir -r requirements.txt

# Application sources.
COPY app.py .
COPY venues.py .
COPY start.sh .

# Hand both the app and the GROBID install to the runtime user.
RUN chown -R 1000:1000 /app /opt/grobid

USER 1000

# Ensure the launcher is executable.
RUN chmod +x start.sh

# Gradio UI port and GROBID port (the latter mainly for debugging).
EXPOSE 7860 8070

CMD ["./start.sh"]
|
README.md
CHANGED
|
@@ -1,12 +1,56 @@
|
|
| 1 |
---
|
| 2 |
title: CiteAudit
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
| 9 |
-
short_description:
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: CiteAudit
|
| 3 |
+
emoji: 😻
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
| 9 |
+
short_description: Automatic references checker for academic papers
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# CiteAudit ✅
|
| 13 |
+
|
| 14 |
+
**CiteAudit** is an automated tool designed to help researchers verify the citations in their academic papers. It cross-references your bibliography against the [Semantic Scholar API](https://www.semanticscholar.org/product/api) to identify broken links, hallucinations, and metadata errors.
|
| 15 |
+
|
| 16 |
+
## ✨ Features
|
| 17 |
+
|
| 18 |
+
- **Automated Extraction**: Uses [GROBID](https://github.com/kermitt2/grobid) to parse PDFs and intelligently extract citation data.
|
| 19 |
+
- **Verification Engine**: Cross-checks each citation against the massive Semantic Scholar database.
|
| 20 |
+
- **Hallucination Detection**: Flags citations that look plausible but may not exist in the real world.
|
| 21 |
+
- **CSV Export**: Download a detailed report of verified, ambiguous, and failed citations.
|
| 22 |
+
- **Privacy Focused**: No data is stored permanently. Uploaded PDFs are processed in an ephemeral session and deleted after use.
|
| 23 |
+
|
| 24 |
+
## 🚀 How to Use
|
| 25 |
+
|
| 26 |
+
1. **Upload your PDF**: Drag and drop your academic paper (e.g., from Overleaf or arXiv) into the upload box.
|
| 27 |
+
2. **Review Extraction**: The app will automatically detect reference pages and list found citations.
|
| 28 |
+
3. **Verify**: Click **"✅ Verify Citations"**. You can optionally provide your own Semantic Scholar API key for faster checking (recommended for large bibliographies).
|
| 29 |
+
4. **Download Report**: Once finished, download the verification report as a CSV file to fix errors in your manuscript.
|
| 30 |
+
|
| 31 |
+
## 🔑 Semantic Scholar API Limits
|
| 32 |
+
|
| 33 |
+
By default, the app uses the free public tier of the Semantic Scholar API, which has rate limits shared across all users.
|
| 34 |
+
|
| 35 |
+
- **For occasional use**: The free tier is usually sufficient.
|
| 36 |
+
- **For heavy use**: If you see "Rate Limited (429)" errors or verify large papers often, we recommend getting a free API key.
|
| 37 |
+
|
| 38 |
+
**How to get a key:**
|
| 39 |
+
1. Request a key from [Semantic Scholar API](https://www.semanticscholar.org/product/api).
|
| 40 |
+
2. Enter it in the "Semantic Scholar API Key" field in the app settings before clicking Verify.
|
| 41 |
+
|
| 42 |
+
## 🛠️ Tech Stack
|
| 43 |
+
|
| 44 |
+
- **Frontend/Backend**: [Gradio](https://gradio.app/) (Python)
|
| 45 |
+
- **PDF Processing**: [PyMuPDF](https://pymupdf.readthedocs.io/)
|
| 46 |
+
- **Citation Parsing**: [GROBID](https://github.com/kermitt2/grobid)
|
| 47 |
+
- **Data Source**: [Semantic Scholar API](https://www.semanticscholar.org/)
|
| 48 |
+
- **Infrastructure**: Docker on Hugging Face Spaces
|
| 49 |
+
|
| 50 |
+
## ⚠️ Disclaimer
|
| 51 |
+
|
| 52 |
+
This tool is an assistant, not a replacement for manual review. Getting a "Verified" status means the citation metadata matches a real paper to a high degree of confidence. Always double-check "Ambiguous" or "Suspected Hallucination" results manually.
|
| 53 |
+
|
| 54 |
+
## 📄 License
|
| 55 |
+
|
| 56 |
+
MIT License. Free to use and modify.
|
app.py
ADDED
|
@@ -0,0 +1,2075 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from gradio_pdf import PDF
|
| 3 |
+
import fitz
|
| 4 |
+
import os
|
| 5 |
+
import tempfile
|
| 6 |
+
import json
|
| 7 |
+
import requests
|
| 8 |
+
import xml.etree.ElementTree as ET
|
| 9 |
+
import re
|
| 10 |
+
import time
|
| 11 |
+
import sys
|
| 12 |
+
from collections import OrderedDict
|
| 13 |
+
import Levenshtein
|
| 14 |
+
import jellyfish
|
| 15 |
+
from unidecode import unidecode
|
| 16 |
+
from venues import VENUE_NAMES, VENUE_ABBREVIATIONS, COMMON_TERMS
|
| 17 |
+
from urlextract import URLExtract
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Semantic Scholar Status Codes
|
| 22 |
+
SEMANTIC_SCHOLAR_STATUS_CODES = {
|
| 23 |
+
200: "OK: Request successful",
|
| 24 |
+
400: "Bad Request: Check parameters",
|
| 25 |
+
401: "Unauthorized: Invalid API key",
|
| 26 |
+
403: "Forbidden: No permission",
|
| 27 |
+
404: "Not Found: Endpoint or resource missing",
|
| 28 |
+
429: "Too Many Requests: Rate limited",
|
| 29 |
+
500: "Internal Server Error: Server-side issue"
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
# Initialize URL extractor
|
| 33 |
+
extractor = URLExtract()
|
| 34 |
+
|
| 35 |
+
def cleanup_old_temp_files(max_age_hours=1):
    """Clean up old temporary files from the system temp dir to save disk space.

    Safe for multi-user deployments: only deletes regular files whose names end
    with one of this app's specific suffixes AND whose mtime is older than
    ``max_age_hours`` (default: 1 hour), so in-flight sessions are untouched.

    Args:
        max_age_hours: Minimum age (in hours) before a matching file is removed.
    """
    cutoff = time.time() - (max_age_hours * 3600)

    temp_dir = tempfile.gettempdir()
    if not os.path.exists(temp_dir):
        return

    # Suffixes produced by NamedTemporaryFile calls elsewhere in this app.
    target_suffixes = ("_grobid.pdf", "_ref_subset.pdf", "_verifications.csv")

    try:
        for filename in os.listdir(temp_dir):
            if not filename.endswith(target_suffixes):
                continue
            file_path = os.path.join(temp_dir, filename)
            try:
                # Check it's a regular file before stat'ing it; either call can
                # still race with concurrent deletion, hence the inner guard.
                if os.path.isfile(file_path) and os.path.getmtime(file_path) < cutoff:
                    os.unlink(file_path)
            except Exception:
                # Best-effort cleanup: another worker may have removed the file.
                pass
    except Exception as e:
        print(f"Error during temp file cleanup: {e}")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def normalize_title_for_comparison(title):
    """Normalize a title for similarity comparison.

    Lowercases the text, replaces every non-alphanumeric character with a
    space, and collapses runs of whitespace into single spaces.
    """
    if not title:
        return ""
    stripped = re.sub(r'[^a-zA-Z0-9\s]', ' ', title.lower())
    # split()/join collapses any run of whitespace down to one space
    return ' '.join(stripped.split())
|
| 77 |
+
|
| 78 |
+
def normalize_api_author(name):
    """Normalize an author name strictly for API-sourced strings.

    Handles both 'Last, First' and 'First Last' orderings and returns a
    canonical lowercase form: ``"<surname> <initial> <initial> ..."``.
    """
    if not name:
        return ""

    # ASCII-fold accents, then drop any "et al" / "etal" marker.
    ascii_name = unidecode(name)
    ascii_name = re.sub(r'\b(et\s*al\.?|etal)\b', '', ascii_name, flags=re.IGNORECASE).strip()

    # Split into surname vs. given-name part depending on the format.
    if "," in ascii_name:
        head, _, tail = ascii_name.partition(",")
        surname = head.strip()
        given = tail.strip()
    else:
        words = ascii_name.split()
        if not words:
            return ""
        if len(words) == 1:
            surname, given = words[0], ""
        else:
            # Last word is the surname; everything before it is given names.
            surname, given = words[-1], " ".join(words[:-1])

    # Canonicalize: alpha-only lowercase surname.
    surname = re.sub(r'[^a-zA-Z]', '', surname).lower()

    # Non-alpha chars become spaces so compact initials like 'J.K.' separate,
    # then each given-name token contributes its first letter.
    given_tokens = re.sub(r'[^a-zA-Z]', ' ', given).lower().split()
    initials_str = " ".join(tok[0] for tok in given_tokens)

    return f"{surname} {initials_str}".strip()
|
| 120 |
+
|
| 121 |
+
def normalize_d_author(name):
    """Normalize an author name for PDF-sourced strings (simpler logic).

    'Last, First' input keeps the pre-comma part as surname; otherwise the
    last word is the surname. Returns ``"<surname> <initial> ..."`` lowercase.
    """
    if not name:
        return ""

    folded = unidecode(name).strip()

    if "," in folded:
        # "Last, First" form (produced by parse_names_by_pattern regrouping).
        before, _, after = folded.partition(",")
        surname = re.sub(r'[^a-zA-Z\s]', '', before).strip().lower()
        # Non-alpha -> spaces so compact initials like 'J.K.' split apart.
        given_tokens = re.sub(r'[^a-zA-Z]', ' ', after.strip()).split()
        initials_str = " ".join(tok[0].lower() for tok in given_tokens)
    else:
        # "First Middle Last" form: last token is the surname.
        tokens = re.sub(r'[^a-zA-Z]', ' ', folded).split()
        if not tokens:
            return ""
        if len(tokens) == 1:
            surname = tokens[0].lower()
            initials_str = ""
        else:
            surname = tokens[-1].lower()
            # First letter of every token before the surname forms the initials.
            initials_str = " ".join(tok[0].lower() for tok in tokens[:-1])

    return f"{surname} {initials_str}".strip()
|
| 167 |
+
|
| 168 |
+
def calculate_title_similarity(d_title, api_title):
    """Return the Levenshtein ratio between two normalized titles (0-1)."""
    left = normalize_title_for_comparison(d_title)
    right = normalize_title_for_comparison(api_title)

    # Either side empty after normalization -> nothing to compare.
    if not left or not right:
        return 0.0

    return Levenshtein.ratio(left, right)
|
| 177 |
+
|
| 178 |
+
def calculate_citation_recall(candidate_title, raw_citation):
    """
    Estimate recall: roughly, how much of the candidate title is present in
    the raw citation?

    Fuzzy-searches ``raw_citation`` for the best-matching window against
    ``candidate_title`` and returns
    (estimated matched characters) / (candidate title length), capped at 1.0.
    Values near 1.0 mean the title appears essentially in full.
    """
    if not candidate_title or not raw_citation:
        return 0.0

    needle = normalize_title_for_comparison(candidate_title)
    haystack = normalize_title_for_comparison(raw_citation)
    if not needle or not haystack:
        return 0.0

    needle_len = len(needle)
    # Try window sizes within roughly +/- 10% of the needle length.
    margin = max(3, int(needle_len * 0.1))
    best = 0.0

    for start in range(len(haystack)):
        for size in range(needle_len - margin, needle_len + margin):
            if size <= 0:
                continue
            if start + size > len(haystack):
                break

            window = haystack[start:start + size]

            # Levenshtein.ratio = 2*matches / (len_a + len_b),
            # so matches ≈ ratio * (len_a + len_b) / 2,
            # and recall = matches / needle_len.
            sim = Levenshtein.ratio(window, needle)
            approx_matches = sim * (len(window) + needle_len) / 2
            recall = approx_matches / needle_len

            if recall > best:
                best = recall
                if best > 0.95:
                    return 1.0  # early exit: effectively a full match

    return min(best, 1.0)
|
| 224 |
+
|
| 225 |
+
def calculate_author_similarity(authors1, authors2):
    """Calculate Jaro-Winkler similarity for author lists (0-1).

    Args:
        authors1: List of normalized author name strings from the original
            citation (PDF side).
        authors2: List of normalized author name strings from Semantic
            Scholar (API side). NOTE(review): despite an earlier comment
            suggesting dicts, the comparison below requires plain strings.

    Returns:
        Refined Jaro-Winkler score (0-1).
    """
    if not authors1 or not authors2:
        return 0.0

    # Asymmetric best-match: for each PDF author, take the similarity of the
    # closest API author.
    best_match_scores = []
    for pdf_name in authors1:
        best = 0.0
        for api_name in authors2:
            score = jellyfish.jaro_winkler_similarity(pdf_name, api_name)
            if score > best:
                best = score
        best_match_scores.append(best)

    avg_score = sum(best_match_scores) / len(best_match_scores) if best_match_scores else 0.0

    # Hallucination penalty: the PDF lists more authors than the API returned
    # (a buffer of 1 tolerates minor parsing differences).
    if len(authors1) > len(authors2) + 1:
        avg_score *= len(authors2) / len(authors1)
    return avg_score
|
| 263 |
+
|
| 264 |
+
def discover_metadata_in_raw(raw_text, api_title, api_authors, is_exact_match=False):
    """
    Search for the title and author segments in the raw text based on API results.
    Returns: (title_after_verification, authors_after_verification) strings or empty.
    """
    if not raw_text:
        return "", ""

    discovered_title = ""
    discovered_authors = ""

    # Build a normalized view of raw_text (lowercase alnum, runs of other
    # chars collapsed to one space) plus a map norm-index -> original index.
    norm_chars = []
    norm_to_orig = []
    pending_space = True  # start True so leading non-alnum is skipped
    for idx, ch in enumerate(raw_text):
        if ch.isalnum():
            norm_chars.append(ch.lower())
            norm_to_orig.append(idx)
            pending_space = False
        elif not pending_space:
            norm_chars.append(' ')
            norm_to_orig.append(idx)
            pending_space = True
    norm_raw_str = "".join(norm_chars)

    # --- 1. Discover the title segment -----------------------------------
    if is_exact_match:
        discovered_title = api_title
    elif api_title:
        # Normalize the API title the same way (no index map needed).
        api_chars = []
        pending = True
        for ch in api_title.lower():
            if ch.isalnum():
                api_chars.append(ch)
                pending = False
            elif not pending:
                api_chars.append(' ')
                pending = True
        norm_api = "".join(api_chars).strip()

        if norm_api and norm_raw_str:
            api_len = len(norm_api)
            best_window = None
            max_score = 0.0

            # Slide a window of near-title length over the normalized text.
            for start in range(len(norm_raw_str)):
                if start + api_len > len(norm_raw_str) + 5:
                    break
                for delta in [0, -1, 1, -2, 2, -3, 3]:
                    size = api_len + delta
                    if size <= 0:
                        continue
                    if start + size > len(norm_raw_str):
                        continue
                    score = Levenshtein.ratio(norm_raw_str[start:start + size], norm_api)
                    if score > max_score:
                        max_score = score
                        best_window = (start, start + size)
                    if max_score > 0.99:
                        break  # essentially perfect match
                if max_score > 0.99:
                    break

            if max_score > 0.75 and best_window:
                lo, hi = best_window
                # Map the normalized window back onto the original string.
                if lo < len(norm_to_orig) and hi <= len(norm_to_orig):
                    discovered_title = raw_text[norm_to_orig[lo]:norm_to_orig[hi - 1] + 1].strip()
                else:
                    discovered_title = api_title
            else:
                discovered_title = api_title
        else:
            discovered_title = api_title

    # --- 2. Discover the author segment -----------------------------------
    # Take everything from the beginning up to the start of the title.
    author_limit_idx = -1

    # Strategy A: position of the discovered title in the raw text.
    if discovered_title and discovered_title in raw_text:
        author_limit_idx = raw_text.find(discovered_title)

    # Strategy B (fail-safe): position of a 4-digit year.
    year_match = re.search(r'\b(19|20|21)\d{2}\b', raw_text)
    if year_match:
        year_idx = year_match.start()
        if author_limit_idx == -1 or year_idx < author_limit_idx:
            author_limit_idx = year_idx

    if author_limit_idx > 0:
        discovered_authors = raw_text[:author_limit_idx].strip().rstrip(".,:; ")
    elif api_authors:
        # Fall back to locating known API author names in the raw text.
        if isinstance(api_authors[0], dict):
            api_names = [a.get('name', '') for a in api_authors if a.get('name')]
        else:
            api_names = [str(a) for a in api_authors]

        found_ends = []
        lowered_raw = raw_text.lower()
        for full_name in api_names:
            words = full_name.lower().split()
            if len(words) >= 2:
                # first-name ... last-name, non-greedy in between
                pattern = re.escape(words[0]) + r'.*?' + re.escape(words[-1])
                hit = re.search(pattern, lowered_raw)
                if hit:
                    found_ends.append(hit.end())

        if found_ends:
            discovered_authors = raw_text[:max(found_ends)].strip().rstrip(".,;:")

    return discovered_title, discovered_authors
|
| 397 |
+
|
| 398 |
+
def classify_verification(title_score, author_score, has_error=False, error_msg=""):
    """Classify verification status based on weighted similarity scores.

    The combined confidence is a weighted hybrid: 70% title similarity and
    30% author similarity. Thresholds: >= 0.95 -> 'verified',
    >= 0.75 -> 'ambiguous', otherwise 'suspected_hallucination'.

    Args:
        title_score: Title similarity score (expected in [0, 1]).
        author_score: Author similarity score (expected in [0, 1]).
        has_error: When True, bypass scoring and report an API error.
        error_msg: Human-readable error description for the error case.

    Returns:
        dict with 'status', 'icon', 'title_score', 'author_score',
        'confidence', and — only for the error case — 'error'.
    """
    if has_error:
        # API failure: zero out all scores and carry the error message.
        return {
            'status': 'api_error',
            'icon': '✗',
            'title_score': 0.0,
            'author_score': 0.0,
            'confidence': 0.0,
            'error': error_msg
        }

    # Weighted hybrid score (title dominates at 70%).
    confidence = (title_score * 0.70) + (author_score * 0.30)

    # Threshold classification.
    if confidence >= 0.95:
        status, icon = 'verified', '✓'
    elif confidence >= 0.75:
        status, icon = 'ambiguous', '⚠'
    else:
        status, icon = 'suspected_hallucination', '⚠⚠'

    return {
        'status': status,
        'icon': icon,
        'title_score': title_score,
        'author_score': author_score,
        'confidence': confidence
    }
| 445 |
+
|
| 446 |
+
def verify_citation_against_paper(raw_citation, api_paper, extracted_title, name_order="first_last", separator=","):
    """
    Verify a citation against a paper using discovery with global pattern awareness.

    Args:
        raw_citation: The raw reference string as it appears in the PDF.
        api_paper: Semantic Scholar paper dict; 'title' and 'authors' are read.
        extracted_title: Title extracted by GROBID (may be empty/None).
        name_order: Global author-name pattern passed to parse_names_by_pattern
            (e.g. "first_last").
        separator: Author-list separator passed to parse_names_by_pattern.

    Returns:
        (check_data, (d_title, d_authors)) where check_data is the dict from
        classify_verification() augmented with 'semantic_data', 'title_source'
        and 'discovery', and d_title/d_authors are the title and author
        segments discovered inside raw_citation.
    """
    api_title = api_paper.get('title', '')
    api_authors_list = api_paper.get('authors', [])

    # Pre-normalize API authors (Ground Truth)
    api_authors_norm = []
    if api_authors_list:
        # SS API returns [{'name': ...}, ...] or just list of names
        if isinstance(api_authors_list[0], dict):
            api_authors_norm = [normalize_api_author(a.get('name', '')) for a in api_authors_list if a.get('name')]
        else:
            api_authors_norm = [normalize_api_author(str(a)) for a in api_authors_list if a]

    # --- TITLE SELECTION LOGIC ---
    # Decide whether the GROBID-extracted title or the API title is the
    # better anchor for locating the title inside the raw citation string.
    best_title_candidate = None
    title_source = ""
    is_exact_match = False

    if extracted_title and api_title:
        norm_extracted = normalize_title_for_comparison(extracted_title)
        norm_api = normalize_title_for_comparison(api_title)

        # len > 10 guards against trivially short strings matching by accident.
        if norm_extracted == norm_api and len(norm_extracted) > 10:
            is_exact_match = True
            best_title_candidate = extracted_title
            title_source = "exact_match"

    if not is_exact_match:
        # Compare extracted_title vs api_title based on RECALL of raw_citation
        recall_extracted = calculate_citation_recall(extracted_title, raw_citation) if extracted_title else 0.0
        recall_api = calculate_citation_recall(api_title, raw_citation)

        # Tie-breaker: If recall is the same, pick the one with fewer words
        if abs(recall_extracted - recall_api) < 1e-7:
            # Tie case
            words_ext = len(extracted_title.split()) if extracted_title else 999
            words_api = len(api_title.split()) if api_title else 999
            if words_ext < words_api:
                best_title_candidate = extracted_title
                title_source = "extracted (tie-breaker shorter)"
            else:
                best_title_candidate = api_title
                title_source = "api (tie-breaker shorter)"
        elif recall_extracted > (recall_api + 0.1):
            # Prefer the extracted title only when it is clearly better
            # (0.1 recall margin); otherwise fall back to the API title.
            best_title_candidate = extracted_title
            title_source = "cleaned/extracted"
        else:
            best_title_candidate = api_title
            title_source = "api"

    # 1. Discovery Step: locate title/author segments inside the raw citation.
    d_title, d_authors = discover_metadata_in_raw(raw_citation, best_title_candidate, api_authors_list, is_exact_match=is_exact_match)

    # 2. Scoring Step: Compare the DISCOVERED title against the API title (Ground Truth)
    if d_title:
        t_score = calculate_title_similarity(d_title, api_title)
    else:
        # Fallback if discovery failed
        # If discovery failed, score is 0 as we couldn't find the title segment
        t_score = 0.0

    # 3. Author Scoring Step
    if d_authors:
        # Detect "et al" in original segments (case-insensitive)
        has_etal = re.search(r'\bet\s*al\b', d_authors, re.IGNORECASE)

        # Use the global pattern and separator for surgery parsing
        parsed_d_authors = parse_names_by_pattern(d_authors, name_order, separator)

        score_forward = calculate_author_similarity(parsed_d_authors, api_authors_norm)

        if has_etal:
            # "et al" lists are truncated, so only the forward comparison is
            # used — presumably checking the cited names against the API list;
            # confirm against calculate_author_similarity's semantics.
            a_score = score_forward
        else:
            # Full author list: average the comparison in both directions.
            score_backward = calculate_author_similarity(api_authors_norm, parsed_d_authors)
            a_score = (0.5 * score_forward) + (0.5 * score_backward)
        sys.stdout.flush()  # NOTE(review): likely leftover from removed debug prints
    else:
        # If discovery failed to find an author segment, score is 0.0
        a_score = 0.0

    check_data = classify_verification(t_score, a_score)
    check_data['semantic_data'] = api_paper
    check_data['title_source'] = title_source

    # Enhance check_data with discovery info
    check_data['discovery'] = (d_title, d_authors)

    return check_data, (d_title, d_authors)
| 538 |
+
|
| 539 |
+
def check_citations_semantic_scholar(citations_to_check, api_key=None, name_order="first_last", separator=","):
    """Check citations using Semantic Scholar API as a generator.

    Two-stage lookup per citation:
      Stage 1: exact title match via /paper/search/match (only when a
               GROBID-extracted title exists).
      Stage 2: fallback relevance search via /paper/search using the title
               and/or the raw citation string; the candidate whose title has
               the highest recall against the raw citation is verified.

    Args:
        citations_to_check: List of citation dicts (reads 'raw_text', 'title';
            mutates each dict in place with 'verification',
            'title_after_verification', 'authors_after_verification').
        api_key: Optional Semantic Scholar API key for higher rate limits.
        name_order: Global author-name pattern forwarded to verification.
        separator: Author-list separator forwarded to verification.

    Yields:
        Each input citation dict, after 'verification' has been attached.
    """
    for i, cit in enumerate(citations_to_check):
        raw_text = cit.get('raw_text', '').strip()
        title = cit.get('title', '').strip()

        # Use the original PDF strings for verification
        raw_citation = cit.get('raw_text', '').strip()
        cleaned_title = title

        try:
            # Default outcome if no stage produces a result.
            check_data = {'status': 'not_found', 'semantic_data': None}
            found_stage1 = False
            response = None

            def make_request(url, p, h):
                # GET with exponential backoff on HTTP 429 and retry on
                # timeouts; returns the last Response, or None on other
                # exceptions / exhausted retries.
                max_retries = 3
                retry_cnt = 0
                while retry_cnt <= max_retries:
                    try:
                        resp = requests.get(url, params=p, headers=h, timeout=10)
                        if resp.status_code == 429:
                            if retry_cnt < max_retries:
                                w_time = 2 ** retry_cnt  # 1s, 2s, 4s backoff
                                time.sleep(w_time)
                                retry_cnt += 1
                            else:
                                return resp
                        else:
                            return resp
                    except requests.exceptions.Timeout:
                        retry_cnt += 1
                    except Exception as e:
                        return None
                return None

            headers = {}
            if api_key:
                headers['x-api-key'] = api_key

            if cleaned_title:
                # --- STAGE 1: Direct Match (/match) by Title ---
                match_url = "https://api.semanticscholar.org/graph/v1/paper/search/match"

                params = {
                    'query': cleaned_title,
                    'fields': 'title,authors,year,venue'
                }
                response = make_request(match_url, params, headers)

                if response is not None:
                    status_desc = SEMANTIC_SCHOLAR_STATUS_CODES.get(response.status_code, f"Unknown ({response.status_code})")

                    if response.status_code == 200:
                        resp_json = response.json()
                        if resp_json.get('data') and len(resp_json['data']) > 0:
                            paper = resp_json['data'][0]
                            if paper and paper.get('paperId'):
                                found_stage1 = True

                                # --- UNIFIED VERIFICATION LOGIC ---
                                check_data, discovery = verify_citation_against_paper(
                                    raw_citation,
                                    paper,
                                    cleaned_title,  # extracted_title
                                    name_order=name_order,
                                    separator=separator
                                )
                                d_title, d_authors = discovery

                                # Store discovery results
                                cit['title_after_verification'] = d_title
                                cit['authors_after_verification'] = d_authors

                    elif response.status_code in [400, 401, 403]:
                        # Client/auth errors are terminal: do not fall back.
                        found_stage1 = True
                        check_data = classify_verification(0, 0, has_error=True, error_msg=status_desc)
                else:
                    # make_request gave up entirely (non-timeout exception).
                    found_stage1 = True
                    check_data = classify_verification(0, 0, has_error=True, error_msg="No Response")

            # --- STAGE 2: Fallback Search (/search) if Stage 1 failed ---
            if not found_stage1:
                if response and response.status_code == 429:
                    check_data = classify_verification(0, 0, has_error=True, error_msg="Rate Limited (429)")
                else:
                    search_url = "https://api.semanticscholar.org/graph/v1/paper/search"

                    # We try up to two different search queries to maximize recall
                    queries_to_try = []
                    if cleaned_title:
                        queries_to_try.append(("Title", cleaned_title))
                    queries_to_try.append(("Raw Citation", raw_citation))

                    all_candidates = {}  # paperId -> paper_data

                    for q_type, q_string in queries_to_try:
                        search_params = {
                            'query': q_string,
                            'limit': 5,
                            'fields': 'title,authors,year,venue'
                        }
                        s_resp = make_request(search_url, search_params, headers)

                        if s_resp and s_resp.status_code == 200:
                            data = s_resp.json().get('data', [])
                            for paper in data:
                                pid = paper.get('paperId')
                                if pid and pid not in all_candidates:
                                    all_candidates[pid] = paper
                        elif s_resp and s_resp.status_code == 429:
                            break  # Stop trying queries if rate limited
                    if all_candidates:
                        results_list = list(all_candidates.values())

                        # --- STAGE 2 OPTIMIZATION: SELECT BEST API GROUND TRUTH BY RECALL ---
                        # 1. Find the API paper whose title has the highest recall against raw citation
                        best_api_paper = None
                        max_api_recall = -1.0
                        min_word_count = 999

                        for paper in results_list:
                            title = paper.get('title', '')
                            rec = calculate_citation_recall(title, raw_citation)
                            word_count = len(title.split()) if title else 999

                            if rec > max_api_recall:
                                max_api_recall = rec
                                min_word_count = word_count
                                best_api_paper = paper
                            elif abs(rec - max_api_recall) < 1e-7:
                                # Tie in recall, check word count
                                if word_count < min_word_count:
                                    min_word_count = word_count
                                    best_api_paper = paper

                        if best_api_paper:
                            # 2. Verify using this Best API Paper
                            # The helper function will automatically decide whether to use the
                            # Best API Title OR the Extracted Title as the 'Anchor' for discovery.
                            check_data, discovery = verify_citation_against_paper(
                                raw_citation,
                                best_api_paper,
                                cleaned_title,
                                name_order=name_order,
                                separator=separator
                            )

                            # Finalize discovery data on the citation object
                            cit['title_after_verification'], cit['authors_after_verification'] = discovery

                            # Stage-2 results are search guesses, so very weak
                            # matches are reported as errors instead.
                            if check_data.get('confidence', 0) < 0.4:
                                check_data = classify_verification(0, 0, has_error=True, error_msg="Low confidence match")
                        else:
                            check_data = classify_verification(0, 0, has_error=True, error_msg="No suitable API candidate found")

                    else:
                        check_data = classify_verification(0, 0, has_error=True, error_msg="No search results found by API")
                        sys.stdout.flush()  # NOTE(review): likely leftover from removed debug prints

            cit['verification'] = check_data
            yield cit

        except Exception as e:
            # Any unexpected failure is reported as an api_error on this
            # citation; the generator keeps going with the next one.
            cit['verification'] = classify_verification(0, 0, has_error=True, error_msg=str(e))
            yield cit
            sys.stdout.flush()

        # Rate limiting: wait 1 second between requests to avoid 429 errors (only if no API key)
        if not api_key and i < len(citations_to_check) - 1:
            time.sleep(1)
| 720 |
+
|
| 721 |
+
def parse_tei_citations(tei_xml):
    """Parse a GROBID TEI XML document and extract bibliography entries.

    Each returned dict may contain 'title', 'authors' (list of
    "Forename Surname" strings), 'year' (the @when attribute of the
    published date), 'venue', plus always 'raw_text' (whitespace-collapsed
    raw reference string) and 'grobid_xml' (the serialized <biblStruct>).
    Entries with no recognizable metadata are skipped entirely.

    Best-effort: returns [] on any parsing failure.
    """
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    try:
        root = ET.fromstring(tei_xml)
        entries = []

        for bibl in root.findall('.//tei:listBibl/tei:biblStruct', ns):
            entry = {}

            # Title: prefer the article-level title, fall back to monograph.
            title_node = bibl.find('.//tei:title[@level="a"]', ns)
            fell_back_to_monograph = False
            if title_node is None:
                title_node = bibl.find('.//tei:title[@level="m"]', ns)
                fell_back_to_monograph = title_node is not None

            if title_node is not None and title_node.text:
                entry['title'] = title_node.text.strip()

            # Authors: join forename + surname per <persName>.
            names = []
            for author_node in bibl.findall('.//tei:author', ns):
                pers = author_node.find('.//tei:persName', ns)
                if pers is None:
                    continue
                pieces = [
                    el.text.strip()
                    for el in (pers.find('.//tei:forename', ns), pers.find('.//tei:surname', ns))
                    if el is not None and el.text
                ]
                if pieces:
                    names.append(' '.join(pieces))
            if names:
                entry['authors'] = names

            # Publication year from the @when attribute.
            date_node = bibl.find('.//tei:date[@type="published"]', ns)
            if date_node is not None and date_node.get('when'):
                entry['year'] = date_node.get('when')

            # Venue: journal title, then monograph title (unless the
            # monograph title was already consumed as the main title),
            # then conference meeting, then publisher as a last resort.
            venue_node = bibl.find('.//tei:title[@level="j"]', ns)
            if venue_node is None and not fell_back_to_monograph:
                venue_node = bibl.find('.//tei:title[@level="m"]', ns)
            if venue_node is None:
                venue_node = bibl.find('.//tei:meeting', ns)

            if venue_node is not None and venue_node.text:
                entry['venue'] = venue_node.text.strip()

            if 'venue' not in entry:
                publisher_node = bibl.find('.//tei:publisher', ns)
                if publisher_node is not None and publisher_node.text:
                    entry['venue'] = publisher_node.text.strip()

            if entry:
                # Display text: prefer GROBID's raw_reference note, else the
                # concatenated text of the whole biblStruct.
                raw_node = bibl.find('.//tei:note[@type="raw_reference"]', ns)
                source_node = raw_node if raw_node is not None else bibl
                collapsed = re.sub(r'\s+', ' ', "".join(source_node.itertext()).strip())
                entry['raw_text'] = collapsed

                # Keep the original XML for downstream re-parsing.
                entry['grobid_xml'] = ET.tostring(bibl, encoding='unicode')

                entries.append(entry)

        return entries
    except Exception:
        return []
| 805 |
+
|
| 806 |
+
def extract_title_and_authors_from_xml(xml_string):
    """Extract the title from a GROBID biblStruct XML fragment.

    Args:
        xml_string: Serialized <biblStruct> element (TEI namespace).

    Returns:
        Dict with an optional 'title' key and an 'authors' key.
        NOTE(review): 'authors' is always returned as an empty list —
        author extraction is not implemented in this helper.
        Returns {} if the XML cannot be parsed.
    """
    # Both prefixes map to the same TEI URI; paths are tried in priority
    # order and the chain stops at the first element that exists.
    ns = {'ns0': 'http://www.tei-c.org/ns/1.0', 'tei': 'http://www.tei-c.org/ns/1.0'}
    title_paths = (
        './/ns0:title[@level="a"][@type="main"]',
        './/ns0:title[@level="a"]',
        './/ns0:title[@level="m"]',
        './/ns0:title',
        './/tei:title[@level="a"][@type="main"]',
        './/tei:title[@level="a"]',
        './/tei:title',
    )

    try:
        root = ET.fromstring(xml_string)
        result = {}

        for path in title_paths:
            node = root.find(path, ns)
            if node is not None:
                # Mirror the original chain: once an element is found the
                # search stops, even if it carries no text.
                if node.text:
                    result['title'] = node.text.strip()
                break

        result['authors'] = []

        return result

    except Exception:
        return {}
| 845 |
+
|
| 846 |
+
def clean_metadata(text):
    """Clean title or author string specifically by removing segments that contain known publication venues or URLs.

    Splits text by common punctuation (.,:;?!), checks each segment for venue names
    (case-insensitive), abbreviations (case-sensitive), or URLs, and removes contaminated segments.

    The output is TRUNCATED at the first contaminated segment: everything
    from that segment onward is discarded, not just the segment itself.

    Relies on module-level globals: `extractor` (presumably a URLExtract
    instance — confirm), `VENUE_NAMES`, `VENUE_ABBREVIATIONS`, `COMMON_TERMS`.
    """
    if not text:
        return ""

    # Pre-cleaning: Remove parentheses symbols but keep the content
    text = text.replace('(', '').replace(')', '')

    # Define additional DOI/Arxiv extraction terms that might not be caught by URLExtract
    extra_patterns = r'arxiv\.org|doi\.org|\bdoi:|\burl\b'

    # 1. Protect URLs during splitting using URLExtract
    # We find all URL matches and replace them with placeholders
    placeholders = []
    temp_text = text

    # Get all URLs from the text
    urls = extractor.find_urls(text, True)

    # Sort by length descending to avoid partial replacement issues
    for url in sorted(list(set(urls)), key=len, reverse=True):
        placeholder = f"__URL_PH_{len(placeholders)}__"
        placeholders.append(url)
        temp_text = temp_text.replace(url, placeholder)

    # Also handle the explicitly requested labels like doi:
    def replace_extra(match):
        placeholder = f"__URL_PH_{len(placeholders)}__"
        placeholders.append(match.group(0))
        return placeholder

    temp_text = re.sub(extra_patterns, replace_extra, temp_text, flags=re.IGNORECASE)

    # 2. Split by punctuation (period, question mark, exclamation mark)
    # We split on . ? or ! followed by space or end of string
    # (capture group keeps the separators so they can be re-attached below)
    parts = re.split(r'([.?!]\s|[.?!]$)', temp_text)

    # Re-group content and its trailing separator
    segments = []
    current_segment = ""
    for part in parts:
        if part and (part.strip() in ['.', '?', '!'] or re.match(r'[.?!]\s', part)):
            segments.append(current_segment + part)
            current_segment = ""
        else:
            current_segment += part
    if current_segment:
        segments.append(current_segment)

    final_segments = []
    for seg in segments:
        # Check if this segment contains a URL placeholder
        if "__URL_PH_" in seg:
            # Entire segment contains a URL, TRUNCATE HERE
            break

        # Restore placeholders just for this segment to check for venues
        check_seg = seg
        for i, val in enumerate(placeholders):
            check_seg = check_seg.replace(f"__URL_PH_{i}__", val)

        seg_lower = check_seg.lower()
        found_contamination = False

        # Check for Venues (Case-Insensitive names, Case-Sensitive abbrs)
        for venue in VENUE_NAMES:
            if venue.lower() in seg_lower:
                found_contamination = True
                break

        if not found_contamination:
            for abbr in VENUE_ABBREVIATIONS:
                # Word-boundary match keeps abbreviation checks case-sensitive.
                if re.search(r'\b' + re.escape(abbr) + r'\b', check_seg):
                    found_contamination = True
                    break

        if not found_contamination:
            for term in COMMON_TERMS:
                if term.lower() in seg_lower:
                    found_contamination = True
                    break

        if not found_contamination:
            # Check for Years (19xx-21xx) - Truncate if found
            # User requested to remove segments with years, but NOT all digits
            if re.search(r'\b(19|20|21)\d{2}\b', check_seg):
                found_contamination = True

        if not found_contamination:
            # Double check for any missed URLs just in case
            if extractor.has_urls(check_seg) or re.search(extra_patterns, check_seg, re.IGNORECASE):
                found_contamination = True

        if found_contamination:
            # TRUNCATE HERE
            break

        # Reconstruct the segment with URLs restored
        restored_seg = seg
        for i, val in enumerate(placeholders):
            restored_seg = restored_seg.replace(f"__URL_PH_{i}__", val)
        final_segments.append(restored_seg)

    # Join remaining segments
    text = "".join(final_segments).strip()

    # Final cleanup: collapse whitespace, drop empty brackets, trim edge punctuation.
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\(\s*\)', '', text)
    text = re.sub(r'\[\s*\]', '', text)
    text = text.strip(".,;: -()[]")

    return text
| 963 |
+
|
| 964 |
+
def find_reference_pages(pdf_path):
    """Locate the start of the reference section in a PDF.

    Scans pages front-to-back for a non-empty line that begins with
    "reference" or "bibliography" (case-insensitive, after stripping).

    Returns:
        (ref_pages, start_page, end_page, ref_text) where ref_pages is
        [start_page] or [] when nothing matched, start_page is the 0-based
        page index (or None), end_page is start_page + 1 (the iterative
        GROBID pass in extract_citations_auto expands the range later) or
        the page count when no section was found, and ref_text is the text
        of the start page plus a trailing newline (or "").
    """
    doc = fitz.open(pdf_path)
    start_page = None

    # Scan for the first page containing a reference-section heading.
    for page_num, page in enumerate(doc):
        candidate_lines = (
            ln.strip().lower()
            for ln in page.get_text("text").splitlines()
        )
        if any(ln.startswith(("reference", "bibliography")) for ln in candidate_lines if ln):
            start_page = page_num
            break

    if start_page is None:
        ref_pages, end_page, ref_text = [], len(doc), ""
    else:
        # Initial guess is JUST the start page; downstream processing
        # widens it as needed.
        ref_pages = [start_page]
        end_page = start_page + 1
        # Extract text for visibility (just the first page for now).
        ref_text = doc[start_page].get_text("text") + "\n"

    doc.close()
    return ref_pages, start_page, end_page, ref_text
| 998 |
+
|
| 999 |
+
def process_pdf_initial(pdf_file, state_pdf_path, state_ref_pages, state_citations, state_removed_citations, state_appendix_header, state_ref_text):
    """Initial PDF processing - find references and show PDF immediately.

    Gradio event handler: returns a fixed-length tuple whose element order
    must match the output components wired to this callback elsewhere in
    the file — do not reorder entries. The first branch resets the UI when
    no file is uploaded; the second shows the PDF and primes state so the
    automatic extraction chain can start.
    """
    # Clean up old temp files whenever a new PDF is uploaded
    cleanup_old_temp_files(max_age_hours=1)

    if pdf_file is None:
        # Reset every output/state component to its empty default.
        return (None, "No PDF uploaded",
                gr.update(visible=False), gr.update(visible=False),
                gr.update(visible=False),
                gr.update(interactive=False, visible=False),
                gr.update(interactive=False, visible=False),
                None, [], [], [], None, "",
                gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
                False,
                gr.update(visible=False),
                None,  # reset state_ref_pdf_path
                "",  # reset state_pdf_name
                gr.update(visible=False),  # reset export_btn
                gr.update(visible=False))  # reset download_file

    new_pdf_path = pdf_file.name
    new_citations = []
    new_removed_citations = []

    # Find reference pages
    new_ref_pages, start_page, end_page, new_ref_text = find_reference_pages(new_pdf_path)
    new_appendix_header = None  # Initialize empty logic for iterative detection

    # Initial status log
    status = f"✓ Loaded PDF: {os.path.basename(new_pdf_path)}\n"

    if new_ref_pages:
        # start_page is 0-based; display as 1-based for the user.
        status += f"\n✓ Identified reference section start: page {start_page + 1}"
    else:
        status += "\n⚠ No reference section found"

    status += "\n⏳ Starting automatic extraction..."

    basename = os.path.basename(new_pdf_path)
    # Return immediately - show PDF right away, extraction starts automatically via event chain
    return (new_pdf_path, status,
            gr.update(value=new_pdf_path, visible=True),
            gr.update(visible=True, value="Show Full PDF"),
            gr.update(visible=False),  # Citations display
            gr.update(interactive=False, visible=False),  # Verify Button
            gr.update(interactive=False, visible=False),  # Slider
            new_pdf_path, new_ref_pages, new_citations, new_removed_citations, new_appendix_header, new_ref_text,
            gr.update(visible=False),  # citations_header
            gr.update(visible=False),  # verification_header
            gr.update(visible=False),  # verification_divider
            gr.update(visible=False),  # api_key_input
            False,  # state_extraction_done
            gr.update(visible=False, value=""),  # corrected_display cleared completely
            None,  # reset state_ref_pdf_path
            basename,  # state_pdf_name
            gr.update(visible=False),  # export_btn
            gr.update(visible=False, value=None))  # download_file
| 1056 |
+
|
| 1057 |
+
def _get_grobid_boundaries(pdf_path, page_indices):
    """Run GROBID fulltext processing on selected pages of a PDF.

    Copies the pages listed in `page_indices` into a standalone temporary
    PDF, posts it to the local GROBID service, and returns the citations
    parsed from the TEI response. Best-effort: returns [] on any failure.
    The temporary file is always removed.
    """
    if not page_indices:
        return []

    output_path = None
    try:
        # Build a temp PDF that contains only the requested pages.
        src_doc = fitz.open(pdf_path)
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix="_grobid.pdf")
        output_path = tmp.name
        tmp.close()

        subset_doc = fitz.open()
        for idx in page_indices:
            subset_doc.insert_pdf(src_doc, from_page=idx, to_page=idx)

        subset_doc.save(output_path, garbage=4, deflate=True, clean=True, expand=True)
        subset_doc.close()
        src_doc.close()

        # Hand the subset to the local GROBID service.
        with open(output_path, 'rb') as fh:
            response = requests.post(
                'http://localhost:8070/api/processFulltextDocument',
                files={'input': (os.path.basename(output_path), fh, 'application/pdf')},
                data={'consolidateCitations': '0', 'includeRawCitations': '1'},
                timeout=120
            )

        if response.status_code == 200:
            return parse_tei_citations(response.text)
        return []
    except Exception:
        return []
    finally:
        # Always clean up the temp PDF, ignoring deletion failures.
        if output_path and os.path.exists(output_path):
            try:
                os.unlink(output_path)
            except:
                pass
| 1099 |
+
|
| 1100 |
+
def extract_citations_auto(view_mode, previous_status, state_pdf_path, state_ref_pages, state_ref_text, state_citations, state_removed_citations, state_appendix_header, state_extraction_done):
    """Extract citations using triple-pass hybrid pipeline to improve recall.

    Generator yielding Gradio output tuples while it:
      1. scans forward page-by-page with GROBID to confirm the reference range,
      2. runs a consolidated GROBID pass over the full confirmed range,
      3. merges per-page and consolidated lists, parses metadata per citation,
         and filters out noise and near-duplicates.
    """

    # Helper building the full output tuple for every yield.
    # BUGFIX: defaults must be None (not []). With [] the `is not None` checks
    # below were always true, so intermediate yields clobbered the current
    # citation state with empty lists instead of falling back to it.
    def gen_update(status_txt, done=False, final_cits=None, final_rem=None, final_pages=None, final_text=None, final_header=None):
        # Use current state or provided finals.
        cits = final_cits if final_cits is not None else state_citations
        rem = final_rem if final_rem is not None else state_removed_citations
        pages = final_pages if final_pages is not None else state_ref_pages
        text = final_text if final_text is not None else state_ref_text
        header = final_header if final_header is not None else state_appendix_header

        loading_update = gr.update(visible=False) if done else gr.update()

        verify_vis = done
        slider_vis = done
        headers_vis = done

        slider_max = len(cits) if cits else 1
        slider_val = min(1, slider_max)

        # Pre-generate the citations HTML only once extraction has finished;
        # intermediate progress yields leave the display untouched.
        if done:
            display_text = format_citations_display(cits)
            if rem:
                display_text += "\n\nREMOVED CITATIONS ({})\n\n".format(len(rem))
                display_text += format_citations_display(rem, show_reason=True)
            citations_html_update = gr.update(value=display_text, visible=headers_vis)
        else:
            citations_html_update = gr.update()

        return (status_txt,
                citations_html_update,  # citations_display (populated when done)
                gr.update(interactive=verify_vis, visible=verify_vis),  # verify_btn
                gr.update(interactive=slider_vis, maximum=slider_max, value=slider_val, visible=slider_vis),  # slider
                cits, rem, pages, text, header,
                gr.update(),  # pdf_viewer (handled by update_view; we only update state)
                loading_update,  # loading indicator
                gr.update(visible=headers_vis),  # citations_header
                gr.update(visible=headers_vis),  # verification_header
                gr.update(visible=headers_vis),  # verification_divider
                gr.update(visible=headers_vis),  # api_key_input
                done,  # state_extraction_done
                gr.update(visible=headers_vis),  # corrected_display
                gr.update(visible=done),  # export_btn
                gr.update(visible=False, value=None))  # download_file

    if not state_ref_pages or not state_pdf_path:
        yield gen_update(previous_status + "\n⚠ No reference pages to process", done=True)
        return

    try:
        start_page_idx = state_ref_pages[0]
        confirmed_ref_pages = []
        per_page_citations = []

        yield gen_update(previous_status + f"\n⏳ Scanning pages starting from {start_page_idx + 1}...")

        doc_temp = fitz.open(state_pdf_path)
        total_pages = len(doc_temp)
        doc_temp.close()

        current_page = start_page_idx

        # Pass 1: scan forward one page at a time; stop at the first page that
        # yields no plausible citation (title/authors/year all missing).
        while current_page < total_pages:
            yield gen_update(previous_status + f"\n⏳ Scanning Page {current_page + 1}...")

            page_cits = _get_grobid_boundaries(state_pdf_path, [current_page])

            valid_count = sum(1 for c in page_cits
                              if c.get('title') or c.get('authors') or c.get('year'))

            if valid_count == 0:
                break
            confirmed_ref_pages.append(current_page)
            per_page_citations.append(page_cits)
            current_page += 1

        if not confirmed_ref_pages:
            yield gen_update(previous_status + "\n⚠ No valid citations extracted from start page.", done=True)
            return

        yield gen_update(previous_status + f"\n✓ Range confirmed: {confirmed_ref_pages[0]+1}-{confirmed_ref_pages[-1]+1}. Merging...", final_pages=confirmed_ref_pages)

        # Update the status log with the confirmed range.
        status_update = f"\n✓ Confirmed Reference Range: Pages {confirmed_ref_pages[0]+1}-{confirmed_ref_pages[-1]+1} ({len(confirmed_ref_pages)} pages)"
        previous_status += status_update

        state_ref_pages = confirmed_ref_pages

        # Re-extract text for the full confirmed range.
        updated_ref_text = ""
        doc_temp = fitz.open(state_pdf_path)
        for p_idx in state_ref_pages:
            updated_ref_text += doc_temp[p_idx].get_text("text") + "\n"

        # --- DYNAMIC HEADER DETECTION ---
        # Detect an appendix-like header on the last reference page so its
        # text can later be trimmed off the final citation.
        last_page_text = doc_temp[state_ref_pages[-1]].get_text("text")
        lines = [l.strip() for l in last_page_text.splitlines() if l.strip()]

        appendix_keywords = ["appendix", "appendices", "supplement", "limitation", "checklist", "statement"]

        # Locate which text lines begin citations on the last page.
        last_page_citations = per_page_citations[-1]
        citation_start_line_indices = []
        for cit in last_page_citations:
            cit_text = cit.get('raw_text', '').strip()
            if not cit_text:
                continue
            cit_prefix = cit_text[:30].strip().lower()
            for k, line in enumerate(lines):
                if cit_prefix in line.lower():
                    citation_start_line_indices.append(k)
                    break

        header_candidates = []
        for i, line in enumerate(lines):
            line_lower = line.lower()
            if len(line.split()) <= 5:
                is_match = False
                if any(k in line_lower for k in appendix_keywords):
                    is_match = True
                elif re.match(r'^A[\.\:]?$', line.split()[0] if line.split() else ""):
                    is_match = True

                if is_match:
                    # Extend very short headers with the following line(s).
                    candidate = line
                    curr_idx = i + 1
                    while len(candidate) < 5 and curr_idx < len(lines):
                        candidate += " " + lines[curr_idx]
                        curr_idx += 1

                    # A real appendix header has no citations after it.
                    has_citations_after = any(start_idx > i for start_idx in citation_start_line_indices)
                    if not has_citations_after:
                        header_candidates.append(candidate)

        if header_candidates:
            state_appendix_header = header_candidates[0]
        else:
            state_appendix_header = None

        doc_temp.close()
        state_ref_text = updated_ref_text

        # Pass 2: consolidated GROBID call over the whole confirmed range (LIST C).
        yield gen_update(previous_status + "\n⏳ Sending full context to GROBID...", final_pages=state_ref_pages, final_text=state_ref_text, final_header=state_appendix_header)
        grobid_citations_a = _get_grobid_boundaries(state_pdf_path, confirmed_ref_pages)

        # Pass 3: span detection & merging of per-page and consolidated lists.
        import difflib
        list_i_pages = per_page_citations
        list_c = grobid_citations_a

        def get_text(cit):
            return cit.get('raw_text', '').strip()

        # For the last citation of each page, prefer a longer consolidated
        # match (the citation likely continues on the next page).
        actions = {}
        for p_idx in range(len(list_i_pages)):
            page_list = list_i_pages[p_idx]
            if not page_list:
                continue

            cit_x = page_list[-1]
            cit_x_text = get_text(cit_x)

            cit_y = None
            cit_y_text = ""
            cit_z = None
            cit_z_text = ""

            if p_idx + 1 < len(list_i_pages) and list_i_pages[p_idx + 1]:
                cit_y = list_i_pages[p_idx + 1][0]
                cit_y_text = get_text(cit_y)
                if len(list_i_pages[p_idx + 1]) > 1:
                    cit_z = list_i_pages[p_idx + 1][1]
                    cit_z_text = get_text(cit_z)

            matches = [c_item for c_item in list_c if cit_x_text in get_text(c_item)]

            best_action = None
            for cit_match in matches:
                match_text = get_text(cit_match)
                # Skip candidates that swallowed the next page's citations.
                if cit_z and cit_z_text in match_text:
                    continue
                if cit_y and cit_y_text in match_text:
                    continue

                if len(match_text) > len(cit_x_text):
                    best_action = {'type': 'extension', 'target': cit_match}
                    break

            if best_action:
                actions[id(cit_x)] = best_action

        # Flatten per-page lists, applying the extension substitutions.
        flat_list_i = []
        for p_list in list_i_pages:
            for cit in p_list:
                act = actions.get(id(cit))
                if act and act['type'] == 'extension':
                    flat_list_i.append(act['target'])
                else:
                    flat_list_i.append(cit)

        # Align against the consolidated list; keep per-page items and take
        # pure inserts from the consolidated pass.
        texts_i = [get_text(c) for c in flat_list_i]
        texts_c = [get_text(c) for c in list_c]
        matcher = difflib.SequenceMatcher(None, texts_i, texts_c)
        final_merged_list = []
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag == 'insert':
                final_merged_list.extend(list_c[j1:j2])
            else:  # 'equal', 'delete', 'replace' all keep the per-page items
                final_merged_list.extend(flat_list_i[i1:i2])

        grobid_citations = final_merged_list

        # Merge URL-only fragments (e.g. a trailing arXiv/DOI line) into the
        # preceding citation.
        merged_citations = []
        for cit in grobid_citations:
            raw_text = cit.get('raw_text', '').strip()
            has_url = extractor.has_urls(raw_text) or re.search(r'arxiv\.org|doi\.org|\bdoi:|\burl\b', raw_text, re.IGNORECASE)
            is_url_only = has_url and len(raw_text.split()) <= 6

            if merged_citations and is_url_only:
                prev_cit = merged_citations[-1]
                prev_cit['raw_text'] = (prev_cit.get('raw_text', '') + " " + raw_text).strip()
            else:
                merged_citations.append(cit)

        grobid_citations = merged_citations

        yield gen_update(previous_status + f"\n⏳ Parsing metadata for {len(grobid_citations)} citations...", final_pages=state_ref_pages, final_text=state_ref_text, final_header=state_appendix_header)

        # Stage 2: extract title and authors for each citation.
        parsed_citations = []

        for idx, cit in enumerate(grobid_citations):
            # Frequent yields during the heavy parsing loop (every 5).
            if idx % 5 == 0:
                yield gen_update(previous_status + f"\n⏳ Parsing citation {idx+1}/{len(grobid_citations)}...", final_pages=state_ref_pages, final_text=state_ref_text, final_header=state_appendix_header)

            raw_text = cit.get('raw_text', '')
            grobid_xml = cit.get('grobid_xml', '')

            # Trim a trailing appendix header glued onto the last citation,
            # then re-parse the cleaned text with GROBID.
            if idx == len(grobid_citations) - 1 and state_appendix_header:
                clean_header = state_appendix_header.strip()[:10].strip().lower()
                clean_header = re.sub(r'\s+', ' ', clean_header)
                raw_lower = re.sub(r'\s+', ' ', raw_text.lower())
                cutoff_index = raw_lower.find(clean_header)
                if cutoff_index > 0:
                    cleaned_raw_reference = raw_text[:cutoff_index].strip()
                    cleaned_raw_reference = re.sub(r'(\.\s*See\s*|\s*See\s*|\.\s*)$', '', cleaned_raw_reference, flags=re.IGNORECASE).strip()
                    raw_text = cleaned_raw_reference
                    try:
                        response = requests.post(
                            'http://localhost:8070/api/processCitation',
                            data={'citations': cleaned_raw_reference, 'includeRawCitations': '1'},
                            timeout=30
                        )
                        if response.status_code == 200:
                            grobid_xml = response.text
                            raw_text = cleaned_raw_reference
                    except Exception:
                        pass

            parsed_fields = extract_title_and_authors_from_xml(grobid_xml)
            title = parsed_fields.get('title', '')
            authors = parsed_fields.get('authors', [])

            # De-hyphenate line-break artifacts from PDF extraction.
            raw_text = raw_text.replace("- ", "")
            title = title.replace("- ", "")

            # Recover leading title words GROBID may have dropped: fuzzy-find
            # the title inside the raw text and prepend any words between the
            # previous sentence boundary and the match.
            if title and len(title) > 5:
                clean_title_prefix = re.sub(r'\W+', '', title.lower()[:40])
                if clean_title_prefix:
                    pattern_parts = [re.escape(c) + r'[\W]*' for c in clean_title_prefix]
                    fuzzy_pattern = r''.join(pattern_parts)
                    raw_lower = raw_text.lower()
                    t_match = re.search(fuzzy_pattern, raw_lower)
                    if t_match:
                        match_start = t_match.start()
                        prev_dot = raw_text.rfind('.', 0, match_start)
                        prev_q = raw_text.rfind('?', 0, match_start)
                        prev_ex = raw_text.rfind('!', 0, match_start)
                        prev_comma = raw_text.rfind(',', 0, match_start)
                        boundary_idx = max(prev_dot, prev_q, prev_ex, prev_comma)
                        start_idx = boundary_idx + 1 if boundary_idx != -1 else 0
                        missed_prefix = raw_text[start_idx:match_start].strip()
                        if missed_prefix:
                            title = f"{missed_prefix} {title}".strip()

            title = clean_metadata(title)

            refined_authors = refine_author_string(raw_text, authors, title)
            refined_authors = clean_metadata(refined_authors)

            # Drop the title if it bled into the author string.
            if title and len(title) > 8:
                if title in refined_authors:
                    refined_authors = refined_authors.split(title)[0].strip()

            refined_authors = refined_authors.strip(".,;: -()")

            parsed_citations.append({
                'raw_text': raw_text,
                'title': title,
                'authors': refined_authors,
                'year': cit.get('year', ''),
                'venue': cit.get('venue', '')
            })

        # Stage 3: filter noise and near-duplicates.
        final_citations = []
        final_removed_citations = []

        for cit in parsed_citations:
            title = cit.get('title', '').strip()
            rejection_reason = None
            raw_text_clean = cit.get('raw_text', '').strip()
            alpha_chars = sum(c.isalnum() for c in raw_text_clean)
            alpha_density = alpha_chars / len(raw_text_clean) if raw_text_clean else 0

            if title.lower().startswith("fig.") or title.lower().startswith("figure"):
                rejection_reason = "Figure caption detected"
            elif not title and not cit.get('authors') and not cit.get('year'):
                rejection_reason = "Missing title, authors, and year"
            elif raw_text_clean.lower() in ["references", "bibliography", "works cited"]:
                rejection_reason = "Section header detected"
            elif len(raw_text_clean) > 5 and alpha_density < 0.3:
                rejection_reason = "Likely noise or artifact (low text density)"

            if rejection_reason:
                cit['rejection_reason'] = rejection_reason
                final_removed_citations.append(cit)
                continue

            is_dup = False
            for existing in final_citations:
                existing_text = existing.get('raw_text', '').strip()
                if jellyfish.jaro_winkler_similarity(raw_text_clean, existing_text) >= 0.95:
                    is_dup = True
                    break

            if not is_dup:
                final_citations.append(cit)
            else:
                cit['rejection_reason'] = "Duplicate (95%+ similarity)"
                final_removed_citations.append(cit)

        status = previous_status + f"\n✓ Hybrid extraction: {len(final_citations)} citations (+{len(final_removed_citations)} filtered)"

        # FINAL YIELD with the completed results.
        yield gen_update(status, done=True, final_cits=final_citations, final_rem=final_removed_citations, final_pages=state_ref_pages, final_text=state_ref_text, final_header=state_appendix_header)

    except Exception as e:
        # Error update: surface the message and reset citation state.
        yield gen_update(previous_status + f"\n❌ Error: {str(e)}", done=True, final_cits=[], final_rem=[])
|
| 1458 |
+
|
| 1459 |
+
def run_citation_check(num_to_check, previous_status, api_key, state_citations):
    """Run citation verification with per-user state.

    Generator yielding 4-tuples matching the wired Gradio outputs:
    (status_text, verification_display_update, citations_display_update,
     citations_state).
    """

    if not state_citations:
        # BUGFIX: this early exit previously yielded a 3-tuple while every
        # other yield emits 4 values, desynchronizing Gradio's output wiring.
        yield (previous_status + "\n⚠ No citations to verify.",
               gr.update(), gr.update(), state_citations)
        return

    # 1. Identify the author naming pattern from the top 10 citations.
    sample_author_strings = [cit.get('authors', '') for cit in state_citations[:10] if cit.get('authors') and isinstance(cit.get('authors'), str)]
    name_order, separator = identify_author_pattern(sample_author_strings)

    # Deep-copy the work list so in-flight verification cannot mutate state.
    import copy
    to_check = copy.deepcopy(state_citations[:num_to_check])

    # Use the API key if provided.
    api_key_clean = api_key.strip() if api_key else None

    updated_citations = list(state_citations)
    total = len(to_check)

    # Stream per-citation progress so "Show Citations" reflects partial results.
    for i, verified_cit in enumerate(check_citations_semantic_scholar(to_check, api_key=api_key_clean, name_order=name_order, separator=separator)):
        if i < len(updated_citations):
            updated_citations[i] = verified_cit

        status_msg = f"{previous_status}\n⏳ Verifying citation {i+1}/{total}..."
        updated_cit_html = format_citations_display(updated_citations)
        yield (status_msg, gr.update(), updated_cit_html, updated_citations)

    # Final yield: summary counts per verification status.
    final_ver_html = format_verifications_display(updated_citations)
    final_cit_html = format_citations_display(updated_citations)
    v_count = sum(1 for c in updated_citations[:total] if c.get('verification', {}).get('status') == 'verified')
    a_count = sum(1 for c in updated_citations[:total] if c.get('verification', {}).get('status') == 'ambiguous')
    h_count = sum(1 for c in updated_citations[:total] if c.get('verification', {}).get('status') == 'suspected_hallucination')
    e_count = sum(1 for c in updated_citations[:total] if c.get('verification', {}).get('status') == 'api_error')
    status_msg = f"Verification Complete: ✅ {v_count} | ⚠️ {a_count} | ❌ {h_count} | 🔌 {e_count}"

    yield (status_msg, final_ver_html, final_cit_html, updated_citations)
|
| 1505 |
+
|
| 1506 |
+
def format_citations_display(citations, show_reason=False):
    """Render the citation list as a scrollable HTML fragment.

    Each entry shows the escaped raw reference text, an optional rejection
    reason (when ``show_reason`` is set and the entry carries one), the
    extracted title in gray, and any post-verification title/authors inside
    a green callout box. Returns "" for an empty list.
    """
    if not citations:
        return ""

    import html as html_lib

    pieces = ["<div style='font-family: sans-serif; font-size: 14px; line-height: 1.5; color: #333; max-height: 600px; overflow-y: auto; padding: 12px; border: 1px solid #e0e0e0; border-radius: 4px; background-color: #ffffff;'>"]

    for number, entry in enumerate(citations, 1):
        escaped_raw = html_lib.escape(entry.get('raw_text', 'No citation text'))

        pieces.append("<div style='margin-bottom: 16px; padding-bottom: 8px; border-bottom: 1px solid #eee;'>")
        pieces.append(f"<div><strong>[{number}]</strong> {escaped_raw}")

        if show_reason and 'rejection_reason' in entry:
            escaped_reason = html_lib.escape(entry['rejection_reason'])
            pieces.append(f" <span style='color: #d32f2f; font-weight: bold; margin-left: 8px;'>[REASON: {escaped_reason}]</span>")

        pieces.append("</div>")

        # Extracted title, indented and grayed for visibility.
        extracted_title = entry.get('title', '')
        if extracted_title:
            pieces.append("<div style='color: #757575; margin-left: 24px; font-size: 0.95em; margin-top: 4px;'>")
            pieces.append(f"<div style='margin-bottom: 2px;'>Title: {html_lib.escape(extracted_title)}</div>")
            pieces.append("</div>")

        # "After verification" fields, if discovery mapping populated them.
        verified_title = entry.get('title_after_verification', '')
        verified_authors = entry.get('authors_after_verification', '')

        if verified_title or verified_authors:
            pieces.append("<div style='color: #2e7d32; margin-left: 24px; font-size: 0.95em; margin-top: 6px; padding: 4px; background-color: #f1f8e9; border-left: 3px solid #8bc34a;'>")
            if verified_title:
                pieces.append(f"<div style='margin-bottom: 2px;'><strong>Title:</strong> {html_lib.escape(verified_title)}</div>")
            if verified_authors:
                if isinstance(verified_authors, list):
                    joined_authors = ", ".join(verified_authors)
                else:
                    joined_authors = str(verified_authors)
                pieces.append(f"<div><strong>Authors:</strong> {html_lib.escape(joined_authors)}</div>")
            pieces.append("</div>")

        pieces.append("</div>")

    pieces.append("</div>")
    return "".join(pieces)
|
| 1560 |
+
|
| 1561 |
+
def refine_author_string(raw_text, grobid_authors, title=None):
    """Heuristically slice the author prefix off a raw reference string.

    Scans from the start of ``raw_text`` up to the first metadata marker — a
    4-digit year (19xx/20xx/21xx) or a fuzzy match of ``title`` — and returns
    everything before the '.'/',' delimited segment containing that marker.
    Falls back to the whole (stripped) string when no marker is found.
    """
    if not raw_text:
        return ""

    lowered = raw_text.lower()

    # 1. Collect metadata start candidates.
    candidate_positions = []

    # Candidate A: a plausible publication year.
    year_hit = re.search(r'\b(19|20|21)\d{2}\b', raw_text)
    if year_hit is not None:
        candidate_positions.append(year_hit.start())

    # Candidate B: fuzzy match of the title's leading characters.
    if title and len(title) > 5:
        compact = re.sub(r'\W+', '', title.lower()[:20])
        if compact:
            fuzzy = ''.join(re.escape(ch) + r'[\W]*' for ch in compact)
            title_hit = re.search(fuzzy, lowered)
            if title_hit is not None:
                candidate_positions.append(title_hit.start())

    # 2. No metadata located: defer cleanup to clean_metadata downstream.
    if not candidate_positions:
        return raw_text.strip()

    meta_start = min(candidate_positions)

    # 3. Discard the entire '.'/',' delimited segment holding the metadata:
    # keep text through the last separator before the marker, or cut exactly
    # at the marker when no separator precedes it.
    prefix = raw_text[:meta_start]
    cut = max(prefix.rfind('.'), prefix.rfind(','))
    if cut != -1:
        segment = raw_text[:cut + 1].strip()
    else:
        segment = prefix.strip()

    # Drop trailing punctuation left behind (e.g. "Author, Author.").
    return segment.rstrip(".,:; ")
|
| 1619 |
+
|
| 1620 |
+
def identify_author_pattern(author_strings):
    """Infer the author naming pattern from sample author strings.

    Returns a (name_order, separator) tuple, e.g. ("last_first", ";").
    Defaults to ("first_last", ",") for empty input.
    """
    if not author_strings:
        return "first_last", ","

    # 1. Pick the divider: prefer ';' only when semicolons clearly dominate
    # commas across all samples.
    semi_total = sum(s.count(";") for s in author_strings)
    comma_total = sum(s.count(",") for s in author_strings)
    divider = ";" if semi_total > (comma_total // 2) else ","

    # 2. Decide name order ("First Last" vs "Last, First").
    if divider == ";":
        # "Last, First; Last, First" style leaves a comma inside most chunks.
        chunks_with_comma = 0
        chunk_total = 0
        for sample in author_strings:
            normalized = re.sub(r'\s+(?:and|&)\s+', '; ', sample, flags=re.IGNORECASE)
            for chunk in (c.strip() for c in normalized.split(';')):
                if not chunk:
                    continue
                chunk_total += 1
                if "," in chunk:
                    chunks_with_comma += 1
        if chunk_total > 0 and chunks_with_comma >= chunk_total * 0.5:
            ordering = "last_first"
        else:
            ordering = "first_last"
    else:
        # Comma divider: mostly single-word chunks implies "Last, First"
        # pairs split across the commas.
        single_word_chunks = 0
        chunk_total = 0
        for sample in author_strings:
            normalized = re.sub(r'\s+(?:and|&)\s+', ', ', sample, flags=re.IGNORECASE)
            for chunk in (c.strip() for c in normalized.split(',')):
                if not chunk:
                    continue
                chunk_total += 1
                if len(chunk.split(" ")) == 1:
                    single_word_chunks += 1
        if chunk_total > 0 and single_word_chunks >= chunk_total * 0.7:
            ordering = "last_first"
        else:
            ordering = "first_last"

    return ordering, divider
|
| 1676 |
+
|
| 1677 |
+
def parse_names_by_pattern(author_string, order, separator):
    """Parse an author string into normalized names using a global pattern.

    Args:
        author_string: Raw author text, e.g. "Smith, J. and Doe, J.".
        order: "first_last" or "last_first" (from identify_author_pattern).
        separator: The divider between names, "," or ";".

    Returns:
        A list of names normalized by normalize_d_author (empties dropped).
    """
    if not author_string:
        return []

    # 1. Strip "et al." variants before splitting.
    author_string = re.sub(r'\b(et\s*al\.?|etal)\b', '', author_string, flags=re.IGNORECASE)

    # 2. Treat "and"/"&" conjunctions as dividers. BUGFIX: the previous
    # pattern r'\b(?:and|&)\b' could never match '&' between spaces (no word
    # boundary exists adjacent to a non-word character), so "A & B" was left
    # as a single segment. Match the surrounding whitespace explicitly, as
    # identify_author_pattern already does.
    s = re.sub(r'\s+(?:and|&)\s+', separator, author_string, flags=re.IGNORECASE)

    sep_esc = re.escape(separator)
    # Collapse runs of separators (and separator/whitespace mixes) into one.
    s = re.sub(sep_esc + r'[\s' + sep_esc + r']*' + sep_esc, separator, s)
    # Remove leading/trailing dividers.
    s = s.strip().strip(separator).strip()

    # 3. Split by the divider.
    segments = [p.strip() for p in s.split(separator) if p.strip()]

    # 4. Regroup: with "Last, First" order and a comma divider, every two
    # consecutive segments form one name; otherwise each segment is a name.
    raw_names = []
    if order == "last_first" and separator == ",":
        i = 0
        while i < len(segments):
            if i + 1 < len(segments):
                raw_names.append(f"{segments[i]}, {segments[i + 1]}")
                i += 2
            else:
                raw_names.append(segments[i])
                i += 1
    else:
        raw_names = segments

    # 5. Final normalization to standardized format (PDF-specific logic).
    authors = []
    for name in raw_names:
        norm = normalize_d_author(name)
        if norm:
            authors.append(norm)

    return authors
|
| 1721 |
+
|
| 1722 |
+
def format_verifications_display(citations):
    """Render citations as HTML cards, each with a color-coded verification badge.

    Args:
        citations: list of citation dicts. Each may carry a 'verification'
            dict with 'status', 'icon', 'confidence', 'title_score',
            'author_score' and (for API errors) 'error'.

    Returns:
        An HTML string, or a placeholder paragraph when the list is empty.
    """
    # Hoisted: the original re-imported this module on every loop iteration.
    import html as html_lib

    if not citations:
        return "<p>No citations extracted yet.</p>"

    # status -> (background, border color, text color, label). These three
    # outcomes share identical badge markup and differ only in styling.
    score_styles = {
        'verified': ('#e8f5e9', '#4caf50', '#2e7d32', 'Verified'),
        'ambiguous': ('#fff3e0', '#ff9800', '#e65100', 'Ambiguous'),
        'suspected_hallucination': ('#ffebee', '#f44336', '#c62828', 'Suspected Hallucination'),
    }

    html_parts = ["<div style='font-family: monospace; font-size: 14px; background-color: #ffffff; padding: 15px; border-radius: 5px;'>"]

    for i, cit in enumerate(citations, 1):
        verification = cit.get('verification', {})

        # Escape the raw citation text before embedding it in HTML.
        raw_text = cit.get('raw_text', 'No citation text')
        safe_raw = html_lib.escape(raw_text)

        html_parts.append("<div style='margin-bottom: 20px; padding: 10px; border: 1px solid #ddd; border-radius: 5px;'>")
        html_parts.append(f"<div><strong>[{i}]</strong> {safe_raw}</div>")

        status = verification.get('status', 'not_verified')
        icon = verification.get('icon', '')

        if status in score_styles:
            # Verified / Ambiguous / Suspected Hallucination: same badge,
            # different colors and label.
            bg, border, color, label = score_styles[status]
            confidence = verification.get('confidence', 0)
            title_score = verification.get('title_score', 0)
            author_score = verification.get('author_score', 0)
            html_parts.append(f"<div style='margin-top: 8px; padding: 6px; background-color: {bg}; border-left: 3px solid {border};'>")
            html_parts.append(f"<strong style='color: {color};'>{icon} {label} (Confidence: {confidence:.2%})</strong>")
            html_parts.append(f"<br/><small>Title similarity: {title_score:.2%} | Author similarity: {author_score:.2%}</small>")
            html_parts.append("</div>")

        elif status == 'api_error':
            error_msg = verification.get('error', 'Unknown error')
            # "No results" is a soft note rather than a hard API failure.
            is_no_result = error_msg == "No search results found by API"
            label = "Verification Note" if is_no_result else "API Error"

            html_parts.append("<div style='margin-top: 8px; padding: 6px; background-color: #fafafa; border-left: 3px solid #9e9e9e;'>")
            html_parts.append(f"<strong style='color: #424242;'>{icon} {label}</strong><br/>")
            html_parts.append(f"<small>{error_msg}</small>")
            html_parts.append("</div>")

        elif status == 'not_verified' or not verification:
            html_parts.append("<div style='margin-top: 8px; padding: 6px; background-color: #f5f5f5; border-left: 3px solid #bdbdbd;'>")
            html_parts.append("<strong style='color: #757575;'>Not Verified</strong>")
            html_parts.append("</div>")

        html_parts.append("</div>")

    html_parts.append("</div>")
    return ''.join(html_parts)
|
| 1793 |
+
|
| 1794 |
+
def export_verifications_csv(state_citations, pdf_name):
    """Write the citation verification results to a CSV file.

    Returns the path of the written file, or None when there are no
    citations or when writing fails (best-effort export).
    """
    if not state_citations:
        return None

    import csv

    # Name the CSV after the uploaded PDF, inside a fresh temp directory,
    # so the downloaded file carries a meaningful name.
    stem = os.path.splitext(pdf_name)[0] if pdf_name else "verifications"
    out_path = os.path.join(tempfile.mkdtemp(), f"{stem}_verifications.csv")

    columns = [
        'ID', 'Status', 'Confidence', 'Title Similarity', 'Author Similarity',
        'Raw Citation', 'Title', 'Authors',
        'API Title', 'API Authors'
    ]

    def build_row(idx, cit):
        """Flatten one citation dict into a CSV row dict."""
        ver = cit.get('verification', {})
        status = ver.get('status', 'not_verified')
        conf = ver.get('confidence', 0)
        t_sim = ver.get('title_score', 0)
        a_sim = ver.get('author_score', 0)

        # Data returned by the Semantic Scholar API, if a match was found.
        sem = ver.get('semantic_data', {})
        api_title = sem.get('title', '') if sem else ''
        api_author_items = sem.get('authors', []) if sem else []
        if not api_author_items:
            api_authors = ""
        elif isinstance(api_author_items[0], dict):
            api_authors = ", ".join(a.get('name', '') for a in api_author_items if a.get('name'))
        else:
            api_authors = ", ".join(str(a) for a in api_author_items if a)

        # Authors after verification may be a list, a string, or anything else.
        authors = cit.get('authors_after_verification', '')
        if isinstance(authors, list):
            authors = ", ".join(authors)
        elif not isinstance(authors, str):
            authors = str(authors)

        unverified = status == 'not_verified'
        return {
            'ID': idx,
            'Status': status,
            'Confidence': 'N/A' if unverified else f"{conf:.2%}",
            'Title Similarity': 'N/A' if unverified else f"{t_sim:.2%}",
            'Author Similarity': 'N/A' if unverified else f"{a_sim:.2%}",
            'Raw Citation': cit.get('raw_text', ''),
            'Title': cit.get('title_after_verification', ''),
            'Authors': authors,
            'API Title': api_title,
            'API Authors': api_authors,
        }

    try:
        with open(out_path, 'w', newline='', encoding='utf-8') as fh:
            writer = csv.DictWriter(fh, fieldnames=columns)
            writer.writeheader()
            writer.writerows(build_row(idx, cit) for idx, cit in enumerate(state_citations, 1))
        return out_path
    except Exception:
        # Best-effort: any write failure simply yields "no file".
        return None
|
| 1861 |
+
|
| 1862 |
+
def update_view(view_mode, state_pdf_path, state_ref_pages, state_citations, state_removed_citations, state_extraction_done, state_ref_pdf_path):
    """Update the view based on selected mode. Controls GROUP visibility.

    This is a generator-style Gradio handler (it uses ``yield``), so every
    code path must *yield* its updates: a plain ``return <tuple>`` inside a
    generator terminates the iteration without emitting the tuple, and the
    UI would never receive those updates.

    Yields a 9-tuple of updates for:
      1. view_full_pdf (Group)
      2. view_ref_pages (Group)
      3. view_citations (Group)
      4. view_verifications (Group)
      5. pdf_viewer_ref (PDF component - content set when showing ref pages)
      6. citations_display (HTML - content set by extraction)
      7. corrected_display (HTML - content set when showing verifications)
      8. loading_indicator (Markdown)
      9. state_ref_pdf_path (str) - cached reference-pages PDF path
    """
    # Defaults: hide every group, change no content, hide the loader.
    vis_full = gr.update(visible=False)
    vis_ref = gr.update(visible=False)
    vis_cit = gr.update(visible=False)
    vis_ver = gr.update(visible=False)

    upd_ref_pdf = gr.update()
    upd_cit_disp = gr.update()
    upd_ver_disp = gr.update()
    upd_load = gr.update(visible=False)  # Default hidden

    if not state_extraction_done and view_mode != "Show Full PDF":
        # Extraction in progress -> show the loading indicator (the Full PDF
        # view is always available) and keep all content views hidden.
        upd_load = gr.update(visible=True)
        # BUGFIX: this was `return (...)`. In a generator that ends the
        # iteration WITHOUT delivering the tuple, so the loading indicator
        # was never shown. It must be yielded.
        yield (vis_full, vis_ref, vis_cit, vis_ver, upd_ref_pdf, upd_cit_disp, upd_ver_disp, upd_load, state_ref_pdf_path)
        return

    if view_mode == "Show Full PDF":
        vis_full = gr.update(visible=True)
        # pdf_viewer_full should already have content from process_pdf_initial
        yield (vis_full, vis_ref, vis_cit, vis_ver, upd_ref_pdf, upd_cit_disp, upd_ver_disp, upd_load, state_ref_pdf_path)

    elif view_mode == "Show Reference Pages":
        vis_ref = gr.update(visible=True)

        # Check the cache first so the subset PDF is generated only once.
        if state_ref_pdf_path and os.path.exists(state_ref_pdf_path):
            upd_ref_pdf = gr.update()
        elif state_ref_pages and state_pdf_path:
            # Generate a PDF containing only the reference pages.
            # NOTE(review): only the first and last entries of
            # state_ref_pages are used, which assumes a sorted, contiguous
            # page range -- confirm against the extraction step.
            doc = fitz.open(state_pdf_path)
            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(doc, from_page=state_ref_pages[0], to_page=state_ref_pages[-1])
                temp_preview = tempfile.NamedTemporaryFile(delete=False, suffix="_ref_subset.pdf")
                output_path = temp_preview.name
                temp_preview.close()
                new_doc.save(output_path, garbage=4, deflate=True, clean=True, expand=True)
            finally:
                # Always release the PyMuPDF documents, even if saving fails.
                new_doc.close()
                doc.close()

            state_ref_pdf_path = output_path
            upd_ref_pdf = gr.update(value=output_path)

        yield (vis_full, vis_ref, vis_cit, vis_ver, upd_ref_pdf, upd_cit_disp, upd_ver_disp, upd_load, state_ref_pdf_path)

    elif view_mode == "Show Citations":
        vis_cit = gr.update(visible=True)
        # Content is pre-filled by extract_citations_auto
        yield (vis_full, vis_ref, vis_cit, vis_ver, upd_ref_pdf, upd_cit_disp, upd_ver_disp, upd_load, state_ref_pdf_path)

    elif view_mode == "Show Verifications":
        vis_ver = gr.update(visible=True)

        # Always render the list. Unverified items will show "Not Verified".
        formatted_ver = format_verifications_display(state_citations)
        upd_ver_disp = gr.update(value=formatted_ver)

        yield (vis_full, vis_ref, vis_cit, vis_ver, upd_ref_pdf, upd_cit_disp, upd_ver_disp, upd_load, state_ref_pdf_path)

    else:
        # Unknown mode: still respond with the defaults so the event
        # listener never completes without emitting any output.
        yield (vis_full, vis_ref, vis_cit, vis_ver, upd_ref_pdf, upd_cit_disp, upd_ver_disp, upd_load, state_ref_pdf_path)
|
| 1935 |
+
|
| 1936 |
+
# Build the UI
# Layout: a control column (upload, status, view toggle, verification
# controls) beside a display column hosting four mutually exclusive view
# groups that update_view() toggles.
with gr.Blocks(title="CiteAudit", css="""
#pdf-viewer-full, #pdf-viewer-ref { min-height: 700px; }
#view-citations, #view-verifications { background-color: white !important; border: none !important; box-shadow: none !important; }
#citations-list, #view-verifications .gr-html { background-color: white !important; }
#main-display-area { background-color: white !important; min-height: 700px; border-radius: 8px; }
""") as demo:
    # Per-user session state
    state_pdf_path = gr.State(None)          # path of the uploaded PDF on disk
    state_ref_pages = gr.State([])           # page indices of the reference section
    state_citations = gr.State([])           # extracted citation dicts
    state_removed_citations = gr.State([])   # citations filtered out during extraction
    state_appendix_header = gr.State(None)   # detected appendix header, if any
    state_ref_text = gr.State("")            # raw text of the reference section
    state_extraction_done = gr.State(False)  # gates non-PDF views while extracting
    state_ref_pdf_path = gr.State(None)  # Cache for Reference Pages PDF
    state_pdf_name = gr.State("")  # Original PDF filename

    gr.Markdown("# CiteAudit")

    with gr.Row():
        # Left column: upload, status, and verification controls. Most
        # controls start hidden and are revealed once a PDF is processed.
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            status_text = gr.Textbox(label="Status", interactive=False, lines=6)

            view_toggle = gr.Radio(
                choices=["Show Full PDF", "Show Reference Pages", "Show Citations", "Show Verifications"],
                value="Show Full PDF",
                label="View Mode",
                interactive=True,
                visible=False
            )

            verification_divider = gr.Markdown("---", visible=False)
            verification_header = gr.Markdown("### Citation Verification", visible=False)

            api_key_input = gr.Textbox(
                label="Semantic Scholar API Key (Optional)",
                placeholder="Leave empty for free tier (with rate limits)",
                type="password",
                interactive=True,
                visible=False
            )

            verify_btn = gr.Button("✅ Verify Citations", variant="secondary", interactive=False, visible=False)

            check_count_slider = gr.Slider(
                minimum=1,
                maximum=50,
                value=1,
                step=1,
                label="Number of citations to check",
                interactive=False,
                visible=False
            )

            export_btn = gr.Button("📊 Download Verifications (CSV)", visible=False)
            download_file = gr.File(label="Download CSV", visible=False)

            gr.Markdown("<br/><small style='color: #888;'>* Automated verification may have mistakes, please check all your citations.</small>")

        # Right column: the display area with the four view groups.
        with gr.Column(scale=2, elem_id="main-display-area"):
            # Loading indicator
            loading_indicator = gr.Markdown("## ⏳ Extracting content... Please wait.", visible=False)

            # 1. Full PDF View
            with gr.Group(visible=True) as view_full_pdf:
                pdf_viewer_full = PDF(label="Full PDF", height=700, elem_id="pdf-viewer-full")

            # 2. Reference Pages View
            with gr.Group(visible=False) as view_ref_pages:
                pdf_viewer_ref = PDF(label="Reference Pages", height=700, elem_id="pdf-viewer-ref")

            # 3. Citations View
            with gr.Group(visible=False, elem_id="view-citations") as view_citations:
                citations_header = gr.Markdown("### Extracted Citations")
                citations_display = gr.HTML(elem_id="citations-list")

            # 4. Verifications View
            with gr.Group(visible=False, elem_id="view-verifications") as view_verifications:
                corrected_display = gr.HTML(label="Corrected Citations")

    # Upload pipeline: initial PDF processing -> automatic citation
    # extraction -> view refresh. The input/output list ORDER must match
    # each handler's parameters / returned tuple exactly.
    file_input.upload(
        fn=process_pdf_initial,
        inputs=[file_input, state_pdf_path, state_ref_pages, state_citations, state_removed_citations, state_appendix_header, state_ref_text],
        outputs=[file_input, status_text, pdf_viewer_full, view_toggle, citations_display, verify_btn, check_count_slider,
                 state_pdf_path, state_ref_pages, state_citations, state_removed_citations, state_appendix_header, state_ref_text,
                 citations_header, verification_header, verification_divider, api_key_input, state_extraction_done, corrected_display, state_ref_pdf_path, state_pdf_name, export_btn, download_file]
    ).then(
        fn=extract_citations_auto,
        inputs=[view_toggle, status_text, state_pdf_path, state_ref_pages, state_ref_text, state_citations, state_removed_citations, state_appendix_header, state_extraction_done],
        outputs=[status_text, citations_display, verify_btn, check_count_slider, state_citations, state_removed_citations, state_ref_pages, state_ref_text, state_appendix_header, pdf_viewer_ref, loading_indicator, citations_header, verification_header, verification_divider, api_key_input, state_extraction_done, corrected_display, export_btn, download_file],
        show_progress="hidden"
    ).then(
        fn=update_view,
        inputs=[view_toggle, state_pdf_path, state_ref_pages, state_citations, state_removed_citations, state_extraction_done, state_ref_pdf_path],
        outputs=[view_full_pdf, view_ref_pages, view_citations, view_verifications, pdf_viewer_ref, citations_display, corrected_display, loading_indicator, state_ref_pdf_path]
    )

    # Verify pipeline: jump to the Verifications view, run the check, then
    # reveal the CSV export button.
    verify_btn.click(
        fn=lambda status: (
            gr.update(value="Show Verifications"),
            status + "\n⏳ Starting verification process...",
            gr.update(), # Do not wipe previous content with a loading message
            gr.update(visible=False, value=None), # Reset download button
            gr.update(visible=False) # Hide export trigger button while processing
        ),
        inputs=[status_text],
        outputs=[view_toggle, status_text, corrected_display, download_file, export_btn]
    ).then(
        fn=run_citation_check,
        inputs=[check_count_slider, status_text, api_key_input, state_citations],
        outputs=[status_text, corrected_display, citations_display, state_citations],
        show_progress="hidden"
    ).then(
        fn=lambda: gr.update(visible=True),
        inputs=None,
        outputs=[export_btn]
    )

    # CSV export: write the file, then make the download component visible.
    export_btn.click(
        fn=export_verifications_csv,
        inputs=[state_citations, state_pdf_name],
        outputs=[download_file]
    ).then(
        fn=lambda: gr.update(visible=True),
        inputs=None,
        outputs=[download_file]
    )

    # Manual view switching via the radio control.
    view_toggle.change(
        fn=update_view,
        inputs=[view_toggle, state_pdf_path, state_ref_pages, state_citations, state_removed_citations, state_extraction_done, state_ref_pdf_path],
        outputs=[view_full_pdf, view_ref_pages, view_citations, view_verifications, pdf_viewer_ref, citations_display, corrected_display, loading_indicator, state_ref_pdf_path],
        concurrency_limit=None,  # no concurrency cap for this listener
        show_progress="hidden"
    )
|
| 2073 |
+
|
| 2074 |
+
if __name__ == "__main__":
    # Listen on all interfaces so the app is reachable from outside the
    # container; port 7860 matches the EXPOSE in the Dockerfile.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
PyMuPDF
|
| 2 |
+
gradio==4.31.0
|
| 3 |
+
gradio_pdf
|
| 4 |
+
pydantic==2.10.6
|
| 5 |
+
grobid-client-python
|
| 6 |
+
huggingface_hub<1.0.0
|
| 7 |
+
requests
|
| 8 |
+
python-Levenshtein
|
| 9 |
+
jellyfish
|
| 10 |
+
unidecode
|
| 11 |
+
urlextract
|
| 12 |
+
|
start.sh
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Container entrypoint: bring up the Grobid service, wait until it answers,
# then hand control to the Gradio app.
set -e

# Start Grobid in the background
echo "Starting Grobid service..."
(cd /opt/grobid && ./grobid-service/bin/grobid-service) &

# Wait for Grobid to be ready; give up (and exit, via set -e) after 60s so
# a broken Grobid install fails the container instead of hanging forever.
echo "Waiting for Grobid to launch on port 8070..."
# Using python to check connection, suppressing output to avoid noisy tracebacks
timeout 60s bash -c 'until python3 -c "import urllib.request; urllib.request.urlopen(\"http://localhost:8070/api/isalive\")" > /dev/null 2>&1; do echo "Waiting for Grobid..."; sleep 5; done'

echo "Grobid is alive!"

# Start the Gradio app. `exec` replaces this shell with the Python process
# so container stop signals (SIGTERM) reach the app directly.
exec python3 app.py
|
venues.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Curated list of CS/AI publication venues
# VENUE_NAMES are full names and should be matched case-insensitively.
# VENUE_ABBREVIATIONS are short forms and MUST be matched case-sensitively
# (short tokens like "WWW" or "ACL" would otherwise match inside ordinary
# text or URLs).

# Full venue names - match case-insensitively.
VENUE_NAMES = [
    "Neural Information Processing Systems",
    "International Conference on Machine Learning",
    "International Conference on Learning Representations",
    "Computer Vision and Pattern Recognition",
    "International Conference on Computer Vision",
    "European Conference on Computer Vision",
    "Association for the Advancement of Artificial Intelligence",
    "International Joint Conference on Artificial Intelligence",
    "Association for Computational Linguistics",
    "Empirical Methods in Natural Language Processing",
    "North American Chapter of the Association for Computational Linguistics",
    "International Conference on Computational Linguistics",
    "Journal of Machine Learning Research",
    "IEEE Transactions on Pattern Analysis and Machine Intelligence",
    "International Journal of Computer Vision",
    "Knowledge Discovery and Data Mining",
    "Special Interest Group on Information Retrieval",
    "The Web Conference",
    "Web Search and Data Mining",
    "Conference on Robot Learning",
    "International Conference on Robotics and Automation",
    "International Conference on Intelligent Robots and Systems",
    "Robotics: Science and Systems",
    "Uncertainty in Artificial Intelligence",
    "Artificial Intelligence and Statistics",
    "Pattern Analysis and Machine Intelligence"
]

# Abbreviated venue names - match case-sensitively (see note above).
VENUE_ABBREVIATIONS = [
    "NeurIPS",
    "ICML",
    "ICLR",
    "CVPR",
    "ICCV",
    "ECCV",
    "AAAI",
    "IJCAI",
    "ACL",
    "EMNLP",
    "NAACL",
    "COLING",
    "JMLR",
    "TPAMI",
    "IJCV",
    "KDD",
    "SIGIR",
    "WWW",
    "WSDM",
    "CoRL",
    "ICRA",
    "IROS",
    "RSS",
    "UAI",
    "AISTATS",
    "NIPS",
    "PAMI"
]

# Common terms associated with venues
# Phrases that typically introduce or accompany a venue string in a citation.
COMMON_TERMS = [
    "Proceedings of",
    "In Proc.",
    "Advances in",
    "Workshop on",
    "Symposium on",
    "Conference on",
    "Transactions on",
    "Journal of"
]
|