Spaces:

fizzarif7
/

docVer

Sleeping

App Files Community

fizzarif7 committed on Sep 1, 2025

Commit

e508a24

verified ·

1 Parent(s): b5f9b9a

Update app.py

Browse files

Files changed (1) hide show

app.py +137 -52

app.py CHANGED Viewed

@@ -1,23 +1,33 @@
-import os
-os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
-os.environ["STREAMLIT_SERVER_PORT"] = os.environ.get("PORT", "7860")
-os.environ["STREAMLIT_SERVER_ADDRESS"] = "0.0.0.0"
 import streamlit as st
-from transformers import pipeline
 import pdfplumber
 import docx
 from PIL import Image
-import pytesseract
 from textblob import TextBlob
 import re
-import fitz  # ✅ PyMuPDF instead of pdf2image
-import os
 # ------------------------
 # Hugging Face Model
-# ------------------------
-classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
 # ------------------------
 # Extraction Functions
@@ -30,8 +40,7 @@ def extract_text_from_pdf(file_path):
             if page_text:
                 text += page_text + "\n"
-    # OCR fallback if no text extracted
-    if not text.strip():
         ocr_text = ""
         doc = fitz.open(file_path)
         for page_num in range(len(doc)):
@@ -71,9 +80,7 @@ def classify_dates(text, dates):
     issue_keywords = ["issued on", "dated", "notified on", "circular no"]
     event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]
-    issue_dates = []
-    event_dates = []
     for d in dates:
         idx = text.lower().find(d.lower())
         if idx != -1:
@@ -83,14 +90,10 @@ def classify_dates(text, dates):
             elif any(k in context for k in event_keywords):
                 after_text = text[idx: idx+80]
                 match = re.search(rf"{re.escape(d)}[^\n]*", after_text)
-                if match:
-                    event_dates.append(match.group().strip())
-                else:
-                    event_dates.append(d)
     if not issue_dates and dates:
         issue_dates.append(dates[0])
     return issue_dates, event_dates
 # ------------------------
@@ -100,41 +103,110 @@ def verify_text(text, source_type="TEXT"):
     if not text.strip():
         return "--- Evidence Report ---\n\n❌ No readable text provided."
     grammar_issue = check_grammar(text)
     dates = extract_dates(text)
     issue_dates, event_dates = classify_dates(text, dates)
     labels = ["REAL", "FAKE"]
     result = classifier(text[:1000], candidate_labels=labels)
     report = "📄 Evidence Report\n\n"
     report += "🔎 Document Analysis\n\n"
     report += f"Source: {source_type}\n\n"
     report += "✅ Evidence Considered\n\n"
     if grammar_issue:
-        report += "Minor grammar/spelling issues were detected but do not affect authenticity.\n\n"
     else:
-        report += "No major grammar or spelling issues detected.\n\n"
-    if issue_dates:
-        report += f"📌 Document Issue Date(s): {', '.join(issue_dates)}\n"
-    if event_dates:
-        report += f"📌 Event/Holiday Date(s): {', '.join(event_dates)}\n"
-    if not dates:
-        report += "No specific dates were clearly detected.\n"
-    report += "\nDocument formatting and official tone resemble genuine university circulars.\n"
-    report += "Signatures and registrar details align with standard official notices.\n\n"
     report += "🏁 Classification Result\n\n"
-    report += f"Verdict: {result['labels'][0]}\n"
-    report += f"Confidence: {result['scores'][0]:.2f}\n"
     return report
 def verify_document(file):
-    file_path = file.name
     ext = file_path.split('.')[-1].lower()
     if ext == "pdf":
         text = extract_text_from_pdf(file_path)
@@ -143,31 +215,44 @@ def verify_document(file):
     elif ext in ["png", "jpg", "jpeg"]:
         text = extract_text_from_image(file_path)
     else:
-        return "Unsupported file type."
     return verify_text(text, source_type=ext.upper())
 # ------------------------
 # Streamlit UI
 # ------------------------
-st.set_page_config(page_title="📑 Document Authenticity Verifier", layout="wide")
 st.title("📑 Document Authenticity Verifier")
-st.write("Upload a **PDF, DOCX, or Image**, OR paste raw **text** to verify authenticity.")
-# File uploader
-uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "png", "jpg", "jpeg"])
-# Text input
-manual_text = st.text_area("Or paste the notification text here:")
-if st.button("Verify Document"):
-    if uploaded_file is not None:
-        with open(uploaded_file.name, "wb") as f:
-            f.write(uploaded_file.getbuffer())
-        report = verify_document(uploaded_file)
-        st.text_area("Verification Report", report, height=400)
-    elif manual_text.strip():
-        report = verify_text(manual_text, source_type="MANUAL TEXT")
-        st.text_area("Verification Report", report, height=400)
-    else:
-        st.warning("Please upload a document or paste text first.")

 import streamlit as st
+from transformers import pipeline,AutoModelForSequenceClassification, AutoTokenizer
 import pdfplumber
 import docx
 from PIL import Image
 from textblob import TextBlob
 import re
+import fitz
+import pytesseract
+pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
 # ------------------------
 # Hugging Face Model
+tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
+model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
+classifier = pipeline(
+    "zero-shot-classification",
+    model=model,
+    tokenizer=tokenizer,
+    device=-1
+)
 # ------------------------
 # Extraction Functions
             if page_text:
                 text += page_text + "\n"
+    if not text.strip():  # OCR fallback
         ocr_text = ""
         doc = fitz.open(file_path)
         for page_num in range(len(doc)):
     issue_keywords = ["issued on", "dated", "notified on", "circular no"]
     event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]
+    issue_dates, event_dates = [], []
     for d in dates:
         idx = text.lower().find(d.lower())
         if idx != -1:
             elif any(k in context for k in event_keywords):
                 after_text = text[idx: idx+80]
                 match = re.search(rf"{re.escape(d)}[^\n]*", after_text)
+                event_dates.append(match.group().strip() if match else d)
     if not issue_dates and dates:
         issue_dates.append(dates[0])
     return issue_dates, event_dates
 # ------------------------
     if not text.strip():
         return "--- Evidence Report ---\n\n❌ No readable text provided."
+    # ------------------------
+    # Heuristic Checks
+    # ------------------------
     grammar_issue = check_grammar(text)
     dates = extract_dates(text)
     issue_dates, event_dates = classify_dates(text, dates)
+    # Scam / fake indicators
+    scam_keywords = [
+        "bank details", "send money", "lottery", "win prize",
+        "transfer fee", "urgent", "click here", "claim", "scholarship $"
+    ]
+    scam_detected = any(kw in text.lower() for kw in scam_keywords)
+    # Date consistency check
+    contradiction = False
+    if issue_dates and event_dates:
+        try:
+            from datetime import datetime
+            fmt_variants = ["%d/%m/%Y", "%d-%m-%Y", "%d.%m.%Y", "%d %B %Y", "%B %d, %Y"]
+            def parse_date(d):
+                for fmt in fmt_variants:
+                    try:
+                        return datetime.strptime(d, fmt)
+                    except Exception:
+                        continue
+                return None
+            parsed_issue = parse_date(issue_dates[0])
+            parsed_event = parse_date(event_dates[0])
+            if parsed_issue and parsed_event and parsed_event < parsed_issue:
+                contradiction = True
+        except Exception:
+            pass
+    # ------------------------
+    # Hugging Face Model
+    # ------------------------
     labels = ["REAL", "FAKE"]
     result = classifier(text[:1000], candidate_labels=labels)
+    model_label = result['labels'][0]
+    model_confidence = result['scores'][0]
+    # ------------------------
+    # Final Verdict Logic
+    # ------------------------
+    final_label = model_label
+    if scam_detected or contradiction or grammar_issue:
+        # downgrade to FAKE if red flags appear
+        final_label = "FAKE"
+    # ------------------------
+    # Report
+    # ------------------------
     report = "📄 Evidence Report\n\n"
     report += "🔎 Document Analysis\n\n"
     report += f"Source: {source_type}\n\n"
     report += "✅ Evidence Considered\n\n"
     if grammar_issue:
+        report += "⚠️ Grammar/Spelling issues detected.\n"
     else:
+        report += "No grammar issues detected.\n"
+    if issue_dates:
+        report += f"📌 Issue Date(s): {', '.join(issue_dates)}\n"
+    if event_dates:
+        report += f"📌 Event Date(s): {', '.join(event_dates)}\n"
+    if not dates:
+        report += "No specific dates detected.\n"
+    if contradiction:
+        report += "⚠️ Date inconsistency detected (event before issue date).\n"
+    if scam_detected:
+        report += "⚠️ Scam-related keywords detected.\n"
+    report += "\nFormatting and tone analyzed.\n\n"
     report += "🏁 Classification Result\n\n"
+    report += f"Model Verdict: {model_label} ({model_confidence:.2f})\n"
+    report += f"Final Verdict: {final_label}\n"
     return report
+import tempfile
+import os
 def verify_document(file):
+    if file is None:
+        return "❌ Please upload a file or provide a file path."
+    # Case 1: If input is a string (direct file path)
+    if isinstance(file, str):
+        file_path = file
+    # Case 2: If input is an uploaded file (Streamlit/Colab)
+    else:
+        # Save to a temporary file
+        suffix = os.path.splitext(file.name)[-1]
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+            tmp.write(file.read())
+            file_path = tmp.name
+    # Detect file type and extract
     ext = file_path.split('.')[-1].lower()
     if ext == "pdf":
         text = extract_text_from_pdf(file_path)
     elif ext in ["png", "jpg", "jpeg"]:
         text = extract_text_from_image(file_path)
     else:
+        return "❌ Unsupported file type."
     return verify_text(text, source_type=ext.upper())
+def process_input(file, manual_text):
+    if file is not None:
+        return verify_document(file)
+    elif manual_text.strip():
+        return verify_text(manual_text, source_type="MANUAL TEXT")
+    else:
+        return "❌ Please upload a document or paste text first."
 # ------------------------
 # Streamlit UI
 # ------------------------
+# ------------------------
+# Streamlit UI
+# ------------------------
+st.set_page_config(page_title="Document Verifier", layout="centered")
 st.title("📑 Document Authenticity Verifier")
+uploaded_file = st.file_uploader(
+    "Upload a document (PDF, DOCX, PNG, JPG)",
+    type=["pdf", "docx", "png", "jpg", "jpeg"]
+)
+manual_text = st.text_area("Or paste text manually")
+# Button for uploaded files
+if st.button("Verify Uploaded Document"):
+    with st.spinner("Analyzing uploaded document..."):
+        result = process_input(uploaded_file, "")
+    st.text_area("Evidence Report", value=result, height=400)
+# Button for manual text
+if st.button("Verify Manual Text"):
+    with st.spinner("Analyzing manual text..."):
+        result = process_input(None, manual_text)
+    st.text_area("Evidence Report", value=result, height=400)