Spaces:

fizzarif7
/

DocumentVerifier

Sleeping

File size: 5,810 Bytes

0056d37

import os
import re
import tempfile

import docx
import pdfplumber
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
from textblob import TextBlob
from transformers import pipeline

# ------------------------------
# Initialize Zero-Shot Classifier
# ------------------------------
@st.cache_resource
def _load_classifier():
    """Build the zero-shot classification pipeline once and reuse it.

    Streamlit re-executes this script on every widget interaction; caching
    the pipeline as a resource avoids rebuilding the model on each rerun.
    """
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


classifier = _load_classifier()

# ------------------------------
# Text Extraction
# ------------------------------
def extract_text_from_pdf(file_path):
    """Return the text of a PDF, falling back to OCR when none is embedded.

    Each page's text is read with pdfplumber. If that yields nothing but
    whitespace, the pages are converted to images and run through
    pytesseract instead. The result is stripped of surrounding whitespace.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]

    # Pages with no extractable text (None or "") contribute nothing.
    embedded = "".join(chunk + "\n" for chunk in page_texts if chunk)
    if embedded.strip():
        return embedded.strip()

    # OCR fallback
    rendered_pages = convert_from_path(file_path)
    recognised = "".join(
        pytesseract.image_to_string(picture) + "\n" for picture in rendered_pages
    )
    return recognised.strip()

def extract_text_from_docx(file_path):
    """Return the paragraph text of a .docx file, one paragraph per line."""
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    joined = "\n".join(paragraph_texts)
    return joined.strip()

def extract_text_from_image(file_path):
    """Run OCR on an image file and return the recognised text, stripped.

    The image is opened in a ``with`` block so its file handle is closed as
    soon as OCR finishes rather than being left open.
    """
    with Image.open(file_path) as img:
        return pytesseract.image_to_string(img).strip()

# ------------------------------
# Grammar & Spelling (TextBlob)
# ------------------------------
def check_grammar(text):
    """Report whether TextBlob's corrector would change *text*.

    Returns True when the corrected string differs from the input,
    False when it comes back unchanged.
    """
    suggestion = TextBlob(text).correct()
    return str(suggestion) != text

# ------------------------------
# Date Extraction (Improved)
# ------------------------------
def extract_dates(text):
    """Return the unique date-like substrings of *text* in document order.

    Four textual date shapes are recognised (see the inline examples).
    Matching is case-insensitive. Duplicates are dropped, keeping the first
    occurrence, and the result is ordered by where each date first appears
    in *text*, so the output is deterministic and ``result[0]`` is the
    earliest date in the document.

    Returns:
        A list of matched substrings exactly as they appear in *text*.
    """
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',       # 28-05-2025 / 28/05/2025
        r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b',           # 28.05.2025
        r'\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s*,?\s*\d{2,4}\b', # 28th May 2025
        r'\b\w+\s+\d{1,2},\s*\d{4}\b',              # May 28, 2025
    ]

    # Record (offset, matched text) so hits from different patterns can be
    # merged back into reading order.
    hits = []
    for pattern in date_patterns:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            hits.append((match.start(), match.group()))

    # The sort is stable, so equal offsets keep pattern order.
    hits.sort(key=lambda hit: hit[0])

    # dict.fromkeys de-duplicates while preserving first-seen order; a plain
    # set() would make the order vary between runs.
    return list(dict.fromkeys(found for _, found in hits))

def classify_dates(text, dates):
    """Split *dates* into issue dates and event dates using nearby keywords.

    For each date, the first case-insensitive occurrence in *text* is
    located and a window of 60 characters either side of its start is
    searched for keywords. Issue keywords take precedence over event
    keywords. Dates that are not found, or whose window contains no
    keyword, are left unclassified.

    If no date was classified as an issue date, the first date that was not
    classified as an event is used as a fallback, so one date is never
    reported as both the issue date and an event date.

    Args:
        text: The document text.
        dates: Date strings to classify (typically from ``extract_dates``).

    Returns:
        A tuple ``(issue_dates, event_dates)``. Event entries hold the date
        plus the rest of its line (within 80 characters of the date's start)
        when that can be captured, otherwise just the date.
    """
    issue_keywords = ["issued on", "dated", "notified on", "circular no"]
    event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]

    issue_dates = []
    event_dates = []
    event_sources = set()  # raw date strings already reported as events

    # Lower-case once instead of once per date.
    lowered = text.lower()

    for d in dates:
        idx = lowered.find(d.lower())
        if idx == -1:
            continue

        # idx is an offset into the lowered text, so cut the window from it.
        context = lowered[max(0, idx - 60): idx + 60]

        if any(k in context for k in issue_keywords):
            issue_dates.append(d)
        elif any(k in context for k in event_keywords):
            # Try to capture event/holiday name next to date
            after_text = text[idx: idx + 80]
            match = re.search(rf"{re.escape(d)}[^\n]*", after_text)
            event_dates.append(match.group().strip() if match else d)
            event_sources.add(d)

    if not issue_dates:
        # Fallback: the first date not already identified as an event.
        fallback = next((d for d in dates if d not in event_sources), None)
        if fallback is not None:
            issue_dates.append(fallback)

    return issue_dates, event_dates

# ------------------------------
# Verification Core
# ------------------------------
def verify_text(text, source_type="TEXT"):
    """Build the plain-text evidence report for a piece of document text.

    Runs the grammar check, date extraction/classification and the
    zero-shot REAL/FAKE classifier (on the first 1000 characters), then
    assembles the results into a report string.

    Args:
        text: The document text to analyse.
        source_type: Label shown on the report's "Source" line.

    Returns:
        The report, or a short "no readable text" report when *text* is
        empty or only whitespace.
    """
    if text.strip() == "":
        return "--- Evidence Report ---\n\n❌ No readable text provided."

    # Analysis steps, in the same order as they appear in the report.
    has_language_issues = check_grammar(text)
    found_dates = extract_dates(text)
    issued, events = classify_dates(text, found_dates)
    outcome = classifier(text[:1000], candidate_labels=["REAL", "FAKE"])

    # Report header and evidence section.
    sections = [
        "📄 Evidence Report\n\n",
        "🔎 Document Analysis\n\n",
        f"Source: {source_type}\n\n",
        "✅ Evidence Considered\n\n",
    ]
    sections.append(
        "Minor grammar/spelling issues were detected but do not affect authenticity.\n\n"
        if has_language_issues
        else "No major grammar or spelling issues detected.\n\n"
    )

    if issued:
        sections.append(f"📌 Document Issue Date(s): {', '.join(issued)}\n")
    if events:
        sections.append(f"📌 Event/Holiday Date(s): {', '.join(events)}\n")
    if not found_dates:
        sections.append("No specific dates were clearly detected.\n")

    # NOTE(review): the next two lines are fixed text; nothing computed above
    # feeds into them.
    sections += [
        "\nDocument formatting and official tone resemble genuine university circulars.\n",
        "Signatures and registrar details align with standard official notices.\n\n",
        "🏁 Classification Result\n\n",
        f"Verdict: {outcome['labels'][0]}\n",
        f"Confidence: {outcome['scores'][0]:.2f}\n",
    ]
    return "".join(sections)

def verify_document(file_path):
    """Extract text from a file by extension and return its evidence report.

    The extension is whatever follows the last "." in *file_path*, compared
    case-insensitively. PDF, DOCX and PNG/JPG/JPEG files are supported; any
    other extension returns the string "Unsupported file type.".
    """
    kind = file_path.rpartition(".")[2].lower()
    if kind not in {"pdf", "docx", "png", "jpg", "jpeg"}:
        return "Unsupported file type."

    # Pick the extractor that matches the extension.
    if kind == "pdf":
        reader = extract_text_from_pdf
    elif kind == "docx":
        reader = extract_text_from_docx
    else:
        reader = extract_text_from_image

    return verify_text(reader(file_path), source_type=kind.upper())

# ------------------------------
# Streamlit UI
# ------------------------------
st.title("📄 Document Verifier")
st.write("Upload a PDF, DOCX, Image, or paste text to check authenticity.")

# File Upload
uploaded_file = st.file_uploader("Upload file", type=["pdf", "docx", "png", "jpg", "jpeg"])

# Text Input
pasted_text = st.text_area("Or paste text below:", height=200)

# Verify File
if uploaded_file is not None:
    # Write the upload to a private temp file, not to the working directory
    # under the client-supplied name. Only the extension is kept, because
    # verify_document() uses it to pick the extractor.
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
        tmp_path = tmp.name
    try:
        result = verify_document(tmp_path)
    finally:
        # Always remove the temp copy, even if verification raised.
        os.remove(tmp_path)
    st.text_area("📋 Evidence Report", result, height=400)

# Verify Text
elif pasted_text.strip():
    result = verify_text(pasted_text, source_type="PASTED TEXT")
    st.text_area("📋 Evidence Report", result, height=400)