"""Streamlit document verifier.

Extracts text from uploaded PDF/DOCX/image files (with an OCR fallback for
scanned PDFs), inspects grammar and date mentions, and runs a zero-shot
REAL/FAKE classification with BART-MNLI, rendering an "Evidence Report".
"""

import os
import re
import tempfile

import docx
import pdfplumber
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
from textblob import TextBlob
from transformers import pipeline

# ------------------------------
# Initialize Zero-Shot Classifier
# ------------------------------
@st.cache_resource
def _load_classifier():
    """Build the BART-MNLI zero-shot pipeline once per Streamlit session.

    Without caching, Streamlit re-executes the module on every widget
    interaction and would reload the model each time.
    """
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


classifier = _load_classifier()


# ------------------------------
# Text Extraction
# ------------------------------
def extract_text_from_pdf(file_path):
    """Extract text from a PDF; fall back to OCR when no text layer exists.

    Args:
        file_path: Path to the PDF on disk.

    Returns:
        Stripped text content ("" when nothing could be read).
    """
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"

    # OCR fallback for scanned / image-only PDFs that yield no text layer.
    if not text.strip():
        ocr_text = ""
        images = convert_from_path(file_path)
        for img in images:
            ocr_text += pytesseract.image_to_string(img) + "\n"
        text = ocr_text

    return text.strip()


def extract_text_from_docx(file_path):
    """Return all paragraph text from a .docx file, newline-joined."""
    doc = docx.Document(file_path)
    return "\n".join(p.text for p in doc.paragraphs).strip()


def extract_text_from_image(file_path):
    """OCR an image file (PNG/JPG/JPEG) into plain text."""
    return pytesseract.image_to_string(Image.open(file_path)).strip()


# ------------------------------
# Grammar & Spelling (TextBlob)
# ------------------------------
def check_grammar(text):
    """Return True when TextBlob's autocorrect would change the text.

    A True result only signals minor spelling deviations; it is treated
    as weak evidence in the report, not as proof of forgery.
    """
    return str(TextBlob(text).correct()) != text


# ------------------------------
# Date Extraction (Improved)
# ------------------------------
# Patterns for common date spellings; compiled once at import time.
_DATE_PATTERNS = [
    re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', re.IGNORECASE),          # 28-05-2025 / 28/05/2025
    re.compile(r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b', re.IGNORECASE),              # 28.05.2025
    re.compile(r'\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s*,?\s*\d{2,4}\b', re.IGNORECASE),  # 28th May 2025
    re.compile(r'\b\w+\s+\d{1,2},\s*\d{4}\b', re.IGNORECASE),                 # May 28, 2025
]


def extract_dates(text):
    """Find date-like substrings in *text*.

    Returns:
        Unique matches as a list (order not guaranteed — deduplicated via set).
    """
    dates_found = []
    for pattern in _DATE_PATTERNS:
        dates_found.extend(pattern.findall(text))
    return list(set(dates_found))


def classify_dates(text, dates):
    """Split detected dates into document-issue dates vs. event/holiday dates.

    Each date is classified by keywords appearing within ~60 characters of
    its first occurrence in *text*. When an event keyword matches, the rest
    of the line after the date is captured so the event name travels with it.
    Falls back to treating the first detected date as the issue date when no
    issue keyword matched anywhere.

    Returns:
        (issue_dates, event_dates) — two lists of strings.
    """
    issue_keywords = ["issued on", "dated", "notified on", "circular no"]
    event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]

    issue_dates = []
    event_dates = []
    lowered = text.lower()
    for d in dates:
        idx = lowered.find(d.lower())
        if idx == -1:
            continue
        context = lowered[max(0, idx - 60): idx + 60]
        if any(k in context for k in issue_keywords):
            issue_dates.append(d)
        elif any(k in context for k in event_keywords):
            # Try to capture the event/holiday name next to the date.
            after_text = text[idx: idx + 80]
            match = re.search(rf"{re.escape(d)}[^\n]*", after_text)
            event_dates.append(match.group().strip() if match else d)

    # Heuristic fallback: assume the first detected date is the issue date.
    if not issue_dates and dates:
        issue_dates.append(dates[0])
    return issue_dates, event_dates


# ------------------------------
# Verification Core
# ------------------------------
def verify_text(text, source_type="TEXT"):
    """Analyze *text* and return a human-readable evidence report string.

    Args:
        text: The document text to verify.
        source_type: Label describing where the text came from (e.g. "PDF").
    """
    if not text.strip():
        return "--- Evidence Report ---\n\nāŒ No readable text provided."

    # Grammar & Spelling
    grammar_issue = check_grammar(text)

    # Dates
    dates = extract_dates(text)
    issue_dates, event_dates = classify_dates(text, dates)

    # Classification — truncate to 1000 chars to keep inference bounded.
    labels = ["REAL", "FAKE"]
    result = classifier(text[:1000], candidate_labels=labels)

    # Build Report
    report = "šŸ“„ Evidence Report\n\n"
    report += "šŸ”Ž Document Analysis\n\n"
    report += f"Source: {source_type}\n\n"
    report += "āœ… Evidence Considered\n\n"

    if grammar_issue:
        report += "Minor grammar/spelling issues were detected but do not affect authenticity.\n\n"
    else:
        report += "No major grammar or spelling issues detected.\n\n"

    if issue_dates:
        report += f"šŸ“Œ Document Issue Date(s): {', '.join(issue_dates)}\n"
    if event_dates:
        report += f"šŸ“Œ Event/Holiday Date(s): {', '.join(event_dates)}\n"
    if not dates:
        report += "No specific dates were clearly detected.\n"

    report += "\nDocument formatting and official tone resemble genuine university circulars.\n"
    report += "Signatures and registrar details align with standard official notices.\n\n"
    report += "šŸ Classification Result\n\n"
    report += f"Verdict: {result['labels'][0]}\n"
    report += f"Confidence: {result['scores'][0]:.2f}\n"
    return report


def verify_document(file_path):
    """Dispatch text extraction by file extension, then verify the text.

    Returns:
        The evidence report, or "Unsupported file type." for unknown extensions.
    """
    ext = file_path.split('.')[-1].lower()
    extractors = {
        "pdf": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "png": extract_text_from_image,
        "jpg": extract_text_from_image,
        "jpeg": extract_text_from_image,
    }
    extractor = extractors.get(ext)
    if extractor is None:
        return "Unsupported file type."
    return verify_text(extractor(file_path), source_type=ext.upper())


# ------------------------------
# Streamlit UI
# ------------------------------
st.title("šŸ“„ Document Verifier")
st.write("Upload a PDF, DOCX, Image, or paste text to check authenticity.")

# File Upload
uploaded_file = st.file_uploader("Upload file", type=["pdf", "docx", "png", "jpg", "jpeg"])

# Text Input
pasted_text = st.text_area("Or paste text below:", height=200)

# Verify File
if uploaded_file is not None:
    # Persist the upload to a temp file (keeping the extension so dispatch
    # works) instead of writing into the working directory under the user's
    # original filename; always clean the temp file up afterwards.
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_file.getbuffer())
        tmp_path = tmp.name
    try:
        result = verify_document(tmp_path)
    finally:
        os.remove(tmp_path)
    st.text_area("šŸ“‹ Evidence Report", result, height=400)

# Verify Text
elif pasted_text.strip():
    result = verify_text(pasted_text, source_type="PASTED TEXT")
    st.text_area("šŸ“‹ Evidence Report", result, height=400)