# Document Verifier — Streamlit app (zero-shot authenticity check for circulars)
| from transformers import pipeline | |
| import pdfplumber | |
| import docx | |
| from PIL import Image | |
| import pytesseract | |
| from pdf2image import convert_from_path | |
| from textblob import TextBlob | |
| import re | |
| import streamlit as st | |
# ------------------------------
# Initialize Zero-Shot Classifier
# ------------------------------
# Loaded once at import time so every request reuses the same model instance.
_ZERO_SHOT_MODEL = "facebook/bart-large-mnli"
classifier = pipeline("zero-shot-classification", model=_ZERO_SHOT_MODEL)
| # ------------------------------ | |
| # Text Extraction | |
| # ------------------------------ | |
def extract_text_from_pdf(file_path):
    """Extract text from a PDF, with an OCR fallback for scanned files.

    Reads the native text layer via pdfplumber first; when no text is
    found at all, rasterizes every page (pdf2image) and runs Tesseract
    OCR over the images instead.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The extracted text, stripped of surrounding whitespace.
    """
    chunks = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            content = page.extract_text()
            if content:
                chunks.append(content + "\n")
    text = "".join(chunks)
    # OCR fallback: only triggered when the PDF yielded no text layer.
    if not text.strip():
        text = "".join(
            pytesseract.image_to_string(img) + "\n"
            for img in convert_from_path(file_path)
        )
    return text.strip()
def extract_text_from_docx(file_path):
    """Return all paragraph text from a .docx file, newline-joined and stripped."""
    document = docx.Document(file_path)
    lines = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(lines).strip()
def extract_text_from_image(file_path):
    """Run Tesseract OCR on an image file and return the stripped text.

    Args:
        file_path: Path to a PNG/JPEG image.

    Returns:
        The OCR'd text with surrounding whitespace removed.
    """
    # Fix: PIL opens the file lazily and the original never closed it,
    # leaking a file handle per call; the context manager closes it.
    with Image.open(file_path) as img:
        return pytesseract.image_to_string(img).strip()
| # ------------------------------ | |
| # Grammar & Spelling (TextBlob) | |
| # ------------------------------ | |
def check_grammar(text):
    """Return True when TextBlob's auto-correction changes the text.

    A changed string is taken as evidence of spelling/grammar issues;
    an unchanged string means TextBlob found nothing to correct.
    """
    corrected = str(TextBlob(text).correct())
    return text != corrected
| # ------------------------------ | |
| # Date Extraction (Improved) | |
| # ------------------------------ | |
def extract_dates(text):
    """Find date-like strings in *text*.

    Matches numeric formats (28-05-2025, 28/05/2025, 28.05.2025) and
    written formats (28th May 2025 / May 28, 2025), case-insensitively.

    Args:
        text: Document text to scan.

    Returns:
        Unique matches in order of first appearance.  Fix: the previous
        ``list(set(...))`` deduplicated but randomized the order, which
        made the generated report non-deterministic between runs.
    """
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',                 # 28-05-2025 / 28/05/2025
        r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b',                     # 28.05.2025
        r'\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s*,?\s*\d{2,4}\b',  # 28th May 2025
        r'\b\w+\s+\d{1,2},\s*\d{4}\b',                        # May 28, 2025
    ]
    dates_found = []
    for pattern in date_patterns:
        dates_found.extend(re.findall(pattern, text, flags=re.IGNORECASE))
    # dict.fromkeys dedupes while preserving first-seen order (dicts are
    # insertion-ordered since Python 3.7).
    return list(dict.fromkeys(dates_found))
def classify_dates(text, dates):
    """Split detected dates into issue dates vs. event/holiday dates.

    Each date is classified by the keywords found within 60 characters
    of its first occurrence in *text*.  Event dates keep the trailing
    text on the same line (so e.g. the holiday name survives).  When no
    issue date is identified but dates exist, the first detected date
    is assumed to be the issue date.

    Returns:
        A ``(issue_dates, event_dates)`` tuple of lists.
    """
    issue_keywords = ["issued on", "dated", "notified on", "circular no"]
    event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]
    lowered = text.lower()
    issue_dates, event_dates = [], []
    for date_str in dates:
        pos = lowered.find(date_str.lower())
        if pos == -1:
            continue
        window = text[max(0, pos - 60): pos + 60].lower()
        if any(keyword in window for keyword in issue_keywords):
            issue_dates.append(date_str)
        elif any(keyword in window for keyword in event_keywords):
            # Grab the rest of the line after the date to capture the event name.
            tail = re.search(rf"{re.escape(date_str)}[^\n]*", text[pos: pos + 80])
            event_dates.append(tail.group().strip() if tail else date_str)
    # Heuristic fallback: treat the first detected date as the issue date.
    if not issue_dates and dates:
        issue_dates.append(dates[0])
    return issue_dates, event_dates
| # ------------------------------ | |
| # Evidence & Classification | |
| # ------------------------------ | |
def verify_document(file_path):
    """Analyze a document and return a human-readable evidence report.

    Supports PDF (with automatic OCR fallback), DOCX, and PNG/JPG/JPEG
    images.  Combines grammar, date-extraction, and zero-shot
    classification evidence into a single report string.

    Args:
        file_path: Path to the uploaded document on disk.

    Returns:
        The formatted evidence report, or a short error message for
        unsupported or unreadable files.
    """
    ext = file_path.split('.')[-1].lower()
    if ext == "pdf":
        text = extract_text_from_pdf(file_path)
    elif ext == "docx":
        text = extract_text_from_docx(file_path)
    elif ext in ("png", "jpg", "jpeg"):
        text = extract_text_from_image(file_path)
    else:
        return "Unsupported file type."
    if not text.strip():
        return "--- Evidence Report ---\n\nβ No readable text was extracted from the document."
    # Grammar & Spelling
    grammar_issue = check_grammar(text)
    # Dates
    dates = extract_dates(text)
    issue_dates, event_dates = classify_dates(text, dates)
    # Classification — input truncated to 1000 chars to keep inference fast.
    labels = ["REAL", "FAKE"]
    result = classifier(text[:1000], candidate_labels=labels)
    # BUG FIX: the original checked `"ocr_text" in locals()`, which is always
    # False here (ocr_text is local to extract_text_from_pdf), so the report
    # said "No" even for image uploads, which are OCR-only by definition.
    if ext in ("png", "jpg", "jpeg"):
        ocr_status = "Yes"
    elif ext == "pdf":
        # The PDF helper applies OCR internally only when there is no text
        # layer; that detail is not surfaced, so report it as conditional.
        ocr_status = "Automatic fallback (used only if the PDF has no text layer)"
    else:
        ocr_status = "No"
    # Build Report
    report = "π Evidence Report\n\n"
    report += "π Document Analysis\n\n"
    report += f"File Type: {ext.upper()}\n"
    report += f"OCR Applied: {ocr_status}\n\n"
    report += "β Evidence Considered\n\n"
    if grammar_issue:
        report += "Minor grammar/spelling issues were detected but do not affect authenticity.\n\n"
    else:
        report += "No major grammar or spelling issues detected.\n\n"
    if issue_dates:
        report += f"π Document Issue Date(s): {', '.join(issue_dates)}\n"
    if event_dates:
        report += f"π Event/Holiday Date(s): {', '.join(event_dates)}\n"
    if not dates:
        report += "No specific dates were clearly detected.\n"
    report += "\nDocument formatting and official tone resemble genuine university circulars.\n"
    report += "Signatures and registrar details align with standard official notices.\n\n"
    report += "π Classification Result\n\n"
    report += f"Verdict: {result['labels'][0]}\n"
    report += f"Confidence: {result['scores'][0]:.2f}\n"
    return report
# ------------------------------
# Streamlit UI
# ------------------------------
st.title("π Document Verifier")
st.write("Upload a PDF, DOCX, or Image to check authenticity.")

uploaded_file = st.file_uploader("Upload file", type=["pdf", "docx", "png", "jpg", "jpeg"])

if uploaded_file is not None:
    # Persist the upload to disk so the path-based extractors can open it.
    with open(uploaded_file.name, "wb") as out:
        out.write(uploaded_file.getbuffer())
    report_text = verify_document(uploaded_file.name)
    st.text_area("π Evidence Report", report_text, height=400)