"""Streamlit app that heuristically verifies document authenticity.

Extracts text from PDF / DOCX / image uploads (with an OCR fallback for
scanned PDFs), runs heuristic checks (grammar, date consistency, scam
keywords) plus a zero-shot REAL/FAKE classification with BART-MNLI, and
renders an evidence report.
"""

import os
import re
import tempfile
from datetime import datetime

import streamlit as st
import docx
import fitz  # PyMuPDF - used only for the OCR fallback on scanned PDFs
import pdfplumber
import pytesseract
from PIL import Image
from textblob import TextBlob
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# st.set_page_config must be the FIRST Streamlit command executed on each
# rerun, so it comes before the (cached) model load below.
st.set_page_config(page_title="Document Verifier", layout="centered")

# ------------------------
# Hugging Face Model
# ------------------------
@st.cache_resource(show_spinner=False)
def _load_classifier():
    """Build the zero-shot classifier once and cache it across Streamlit reruns.

    Without caching, the ~1.6 GB BART-MNLI model would be reloaded on every
    widget interaction, since Streamlit re-executes the whole script.
    """
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
    model = AutoModelForSequenceClassification.from_pretrained(
        "facebook/bart-large-mnli"
    )
    return pipeline(
        "zero-shot-classification",
        model=model,
        tokenizer=tokenizer,
        device=-1,  # CPU
    )


classifier = _load_classifier()


# ------------------------
# Extraction Functions
# ------------------------
def extract_text_from_pdf(file_path):
    """Extract text from a PDF; fall back to OCR if no text layer exists.

    Returns the stripped text ("" if nothing could be extracted).
    """
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"

    if not text.strip():
        # Scanned PDF: rasterize each page with PyMuPDF and OCR it.
        ocr_text = ""
        doc = fitz.open(file_path)
        for page_num in range(len(doc)):
            page = doc[page_num]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            ocr_text += pytesseract.image_to_string(img) + "\n"
        text = ocr_text

    return text.strip()


def extract_text_from_docx(file_path):
    """Return all paragraph text of a .docx file, newline-joined and stripped."""
    doc = docx.Document(file_path)
    return "\n".join([p.text for p in doc.paragraphs]).strip()


def extract_text_from_image(file_path):
    """OCR an image file (PNG/JPG) and return the stripped text."""
    return pytesseract.image_to_string(Image.open(file_path)).strip()


def check_grammar(text):
    """Return True if TextBlob's auto-correction would change the text.

    A changed correction is treated as "grammar/spelling issues present".
    NOTE(review): TextBlob.correct() is slow on long texts and frequently
    "corrects" proper nouns, so this is a noisy heuristic.
    """
    blob = TextBlob(text)
    corrected_text = str(blob.correct())
    return corrected_text != text


def extract_dates(text):
    """Find date-like substrings in *text*.

    Matches numeric forms (12/05/2024, 12-05-24, 12.05.2024) and textual
    forms ("5th March 2024", "March 5, 2024"). Returns unique matches in
    first-occurrence order -- order matters because classify_dates() uses
    the first date as the fallback issue date.
    """
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
        r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b',
        r'\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s*,?\s*\d{2,4}\b',
        r'\b\w+\s+\d{1,2},\s*\d{4}\b',
    ]
    dates_found = []
    for pattern in date_patterns:
        matches = re.findall(pattern, text, flags=re.IGNORECASE)
        dates_found.extend(matches)
    # dict.fromkeys dedupes while preserving insertion order (unlike set()).
    return list(dict.fromkeys(dates_found))


def classify_dates(text, dates):
    """Split *dates* into (issue_dates, event_dates) using nearby keywords.

    For each date, a 120-char context window around its first occurrence is
    scanned for issue/event cue phrases. If no issue date is identified but
    dates exist, the first detected date is assumed to be the issue date.
    """
    issue_keywords = ["issued on", "dated", "notified on", "circular no"]
    event_keywords = [
        "holiday", "observed on", "exam on", "will be held on", "effective from",
    ]
    issue_dates, event_dates = [], []
    for d in dates:
        idx = text.lower().find(d.lower())
        if idx != -1:
            context = text[max(0, idx - 60): idx + 60].lower()
            if any(k in context for k in issue_keywords):
                issue_dates.append(d)
            elif any(k in context for k in event_keywords):
                # Capture the rest of the line after the date for richer output.
                after_text = text[idx: idx + 80]
                match = re.search(rf"{re.escape(d)}[^\n]*", after_text)
                event_dates.append(match.group().strip() if match else d)
    if not issue_dates and dates:
        issue_dates.append(dates[0])
    return issue_dates, event_dates


# ------------------------
# Verification Logic
# ------------------------
def verify_text(text, source_type="TEXT"):
    """Run all heuristics plus the model on *text* and return a report string.

    Heuristic red flags (scam keywords, event-before-issue date contradiction,
    grammar issues) downgrade the final verdict to FAKE regardless of the
    model's output.
    """
    if not text.strip():
        return "--- Evidence Report ---\n\nāŒ No readable text provided."

    # ------------------------
    # Heuristic Checks
    # ------------------------
    grammar_issue = check_grammar(text)
    dates = extract_dates(text)
    issue_dates, event_dates = classify_dates(text, dates)

    # Scam / fake indicators
    scam_keywords = [
        "bank details", "send money", "lottery", "win prize",
        "transfer fee", "urgent", "click here", "claim", "scholarship $",
    ]
    scam_detected = any(kw in text.lower() for kw in scam_keywords)

    # Date consistency check: an event dated before the document was issued
    # is suspicious.
    contradiction = False
    if issue_dates and event_dates:
        try:
            fmt_variants = [
                "%d/%m/%Y", "%d-%m-%Y", "%d.%m.%Y", "%d %B %Y", "%B %d, %Y",
            ]

            def parse_date(d):
                # Strip ordinal suffixes ("1st" -> "1") so strptime can match.
                d = re.sub(r"(?<=\d)(st|nd|rd|th)", "", d, flags=re.IGNORECASE)
                for fmt in fmt_variants:
                    try:
                        return datetime.strptime(d, fmt)
                    except ValueError:
                        continue
                return None

            parsed_issue = parse_date(issue_dates[0])
            # Event strings may carry trailing context; only the leading date
            # portion is expected to parse, so failures are tolerated.
            parsed_event = parse_date(event_dates[0])
            if parsed_issue and parsed_event and parsed_event < parsed_issue:
                contradiction = True
        except Exception:
            # Best-effort check: never let date parsing break verification.
            pass

    # ------------------------
    # Hugging Face Model
    # ------------------------
    labels = ["REAL", "FAKE"]
    # Truncate to keep inference fast and within the model's input limit.
    result = classifier(text[:1000], candidate_labels=labels)
    model_label = result['labels'][0]
    model_confidence = result['scores'][0]

    # ------------------------
    # Final Verdict Logic
    # ------------------------
    final_label = model_label
    if scam_detected or contradiction or grammar_issue:
        # Downgrade to FAKE if any red flag appears.
        final_label = "FAKE"

    # ------------------------
    # Report
    # ------------------------
    report = "šŸ“„ Evidence Report\n\n"
    report += "šŸ”Ž Document Analysis\n\n"
    report += f"Source: {source_type}\n\n"
    report += "āœ… Evidence Considered\n\n"

    if grammar_issue:
        report += "āš ļø Grammar/Spelling issues detected.\n"
    else:
        report += "No grammar issues detected.\n"

    if issue_dates:
        report += f"šŸ“Œ Issue Date(s): {', '.join(issue_dates)}\n"
    if event_dates:
        report += f"šŸ“Œ Event Date(s): {', '.join(event_dates)}\n"
    if not dates:
        report += "No specific dates detected.\n"

    if contradiction:
        report += "āš ļø Date inconsistency detected (event before issue date).\n"
    if scam_detected:
        report += "āš ļø Scam-related keywords detected.\n"

    report += "\nFormatting and tone analyzed.\n\n"
    report += "šŸ Classification Result\n\n"
    report += f"Model Verdict: {model_label} ({model_confidence:.2f})\n"
    report += f"Final Verdict: {final_label}\n"

    return report


def verify_document(file):
    """Verify an uploaded file object or a filesystem path string.

    Uploaded file-like objects are spooled to a temp file (required because
    the extractors take paths); the temp file is always removed afterwards.
    """
    if file is None:
        return "āŒ Please upload a file or provide a file path."

    tmp_path = None
    try:
        # Case 1: input is a direct filesystem path.
        if isinstance(file, str):
            file_path = file
        # Case 2: input is an uploaded file (Streamlit/Colab).
        else:
            suffix = os.path.splitext(file.name)[-1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp.write(file.read())
                tmp_path = tmp.name
            file_path = tmp_path

        # Detect file type from the extension and dispatch to an extractor.
        ext = file_path.split('.')[-1].lower()
        if ext == "pdf":
            text = extract_text_from_pdf(file_path)
        elif ext == "docx":
            text = extract_text_from_docx(file_path)
        elif ext in ["png", "jpg", "jpeg"]:
            text = extract_text_from_image(file_path)
        else:
            return "āŒ Unsupported file type."

        return verify_text(text, source_type=ext.upper())
    finally:
        # Clean up the temp copy of an upload; never delete a caller's path.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass


def process_input(file, manual_text):
    """Route to document verification (if a file is given) or manual text."""
    if file is not None:
        return verify_document(file)
    elif manual_text.strip():
        return verify_text(manual_text, source_type="MANUAL TEXT")
    else:
        return "āŒ Please upload a document or paste text first."


# ------------------------
# Streamlit UI
# ------------------------
st.title("šŸ“‘ Document Authenticity Verifier")

uploaded_file = st.file_uploader(
    "Upload a document (PDF, DOCX, PNG, JPG)",
    type=["pdf", "docx", "png", "jpg", "jpeg"]
)

manual_text = st.text_area("Or paste text manually")

# Button for uploaded files
if st.button("Verify Uploaded Document"):
    with st.spinner("Analyzing uploaded document..."):
        result = process_input(uploaded_file, "")
    st.text_area("Evidence Report", value=result, height=400)

# Button for manual text
if st.button("Verify Manual Text"):
    with st.spinner("Analyzing manual text..."):
        result = process_input(None, manual_text)
    st.text_area("Evidence Report", value=result, height=400)