# File size: 5,626 Bytes
# 7e5954b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
import re
import tempfile

import docx
import pdfplumber
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
from textblob import TextBlob
from transformers import pipeline
# ------------------------------
# Initialize Zero-Shot Classifier
# ------------------------------
@st.cache_resource
def _load_classifier():
    """Build the zero-shot classification pipeline (cached by Streamlit)."""
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


# Streamlit re-executes this script on every user interaction; going through
# the cached loader avoids rebuilding the model pipeline on each rerun.
classifier = _load_classifier()

# ------------------------------
# Text Extraction
# ------------------------------
def extract_text_from_pdf(file_path):
    """Return the text of a PDF, falling back to OCR when none is embedded.

    Each page is first read with pdfplumber. If that yields nothing but
    whitespace, the pages are rendered to images with pdf2image and passed
    through pytesseract instead.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text with surrounding whitespace stripped.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]

    # Pages for which extract_text() returned nothing are skipped.
    embedded = "".join(chunk + "\n" for chunk in page_texts if chunk)
    if embedded.strip():
        return embedded.strip()

    # OCR fallback: no usable embedded text was found.
    rendered_pages = convert_from_path(file_path)
    recognized = "".join(
        pytesseract.image_to_string(img) + "\n" for img in rendered_pages
    )
    return recognized.strip()

def extract_text_from_docx(file_path):
    """Return the paragraph text of a .docx file, one paragraph per line.

    Only ``doc.paragraphs`` is read; the result is stripped of surrounding
    whitespace.

    Args:
        file_path: Path of the .docx file to read.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts).strip()

def extract_text_from_image(file_path):
    """Run OCR on an image file and return the recognized text.

    Args:
        file_path: Path of the image file to read.

    Returns:
        The text produced by pytesseract, stripped of surrounding whitespace.
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR finishes instead of lingering until GC.
    with Image.open(file_path) as img:
        return pytesseract.image_to_string(img).strip()

# ------------------------------
# Grammar & Spelling (TextBlob)
# ------------------------------
def check_grammar(text):
    """Report whether TextBlob's ``correct()`` would change ``text``.

    Args:
        text: The text to check.

    Returns:
        True if the corrected text differs from the input, False otherwise.
    """
    suggestion = TextBlob(text).correct()
    return str(suggestion) != text

# ------------------------------
# Date Extraction (Improved)
# ------------------------------
def extract_dates(text):
    """Find date-like substrings in ``text``.

    Four regular expressions are tried (numeric dates separated by ``/``,
    ``-`` or ``.``, and two day/word/year layouts), all case-insensitively.

    Args:
        text: The text to scan.

    Returns:
        A list of the distinct matched strings, ordered by where each first
        appears in ``text``. The order is deterministic, so callers that
        treat the first element specially get the earliest date in the
        document rather than an arbitrary one.
    """
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',       # 28-05-2025 / 28/05/2025
        r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b',           # 28.05.2025
        r'\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s*,?\s*\d{2,4}\b', # 28th May 2025
        r'\b\w+\s+\d{1,2},\s*\d{4}\b',              # May 28, 2025
    ]

    # Record (position, pattern rank, matched text) so results can be put in
    # document order; the rank only breaks ties between patterns.
    hits = []
    for rank, pattern in enumerate(date_patterns):
        for found in re.finditer(pattern, text, flags=re.IGNORECASE):
            hits.append((found.start(), rank, found.group()))
    hits.sort()

    # dict.fromkeys drops duplicates while keeping first-seen order, unlike
    # set(), whose iteration order for strings varies between runs.
    return list(dict.fromkeys(value for _, _, value in hits))

def classify_dates(text, dates):
    """Split ``dates`` into issue dates and event dates using nearby keywords.

    For each date, the first case-insensitive occurrence in ``text`` is
    located and a window of 60 characters on either side of its start is
    searched for keywords. Issue keywords take precedence over event
    keywords; a date with neither, or one not found in ``text``, is dropped.

    Args:
        text: The document text the dates were taken from.
        dates: Date strings to classify.

    Returns:
        A tuple ``(issue_dates, event_dates)``. Event entries hold the date
        plus the rest of its line (within 80 characters of the date's start)
        when that can be matched, otherwise the bare date. If no issue date
        was identified and ``dates`` is non-empty, ``dates[0]`` is used as
        the issue date.
    """
    issue_keywords = ["issued on", "dated", "notified on", "circular no"]
    event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]

    lowered_text = text.lower()
    issue_dates, event_dates = [], []

    for d in dates:
        position = lowered_text.find(d.lower())
        if position == -1:
            continue

        window = text[max(0, position - 60): position + 60].lower()

        if any(keyword in window for keyword in issue_keywords):
            issue_dates.append(d)
            continue
        if not any(keyword in window for keyword in event_keywords):
            continue

        # Try to capture the event/holiday name that follows the date.
        after_text = text[position: position + 80]
        found = re.search(rf"{re.escape(d)}[^\n]*", after_text)
        event_dates.append(found.group().strip() if found else d)

    if dates and not issue_dates:
        issue_dates.append(dates[0])

    return issue_dates, event_dates

# ------------------------------
# Evidence & Classification
# ------------------------------
def _pdf_has_text_layer(file_path):
    """Return True when pdfplumber extracts non-blank text from any page."""
    with pdfplumber.open(file_path) as pdf:
        return any((page.extract_text() or "").strip() for page in pdf.pages)


def verify_document(file_path):
    """Extract text from a document and build a plain-text evidence report.

    The file type is taken from the text after the last dot in ``file_path``
    (case-insensitive). The extracted text is spell-checked, scanned for
    dates, and its first 1000 characters are passed to the zero-shot
    classifier with the labels ``REAL`` and ``FAKE``.

    Args:
        file_path: Path to a .pdf, .docx, .png, .jpg or .jpeg file.

    Returns:
        The report as a string. A short message is returned instead when the
        extension is not supported or no text could be extracted.
    """
    image_exts = ("png", "jpg", "jpeg")

    ext = file_path.split('.')[-1].lower()
    if ext == "pdf":
        text = extract_text_from_pdf(file_path)
    elif ext == "docx":
        text = extract_text_from_docx(file_path)
    elif ext in image_exts:
        text = extract_text_from_image(file_path)
    else:
        return "Unsupported file type."

    if not text.strip():
        return "--- Evidence Report ---\n\n❌ No readable text was extracted from the document."

    # Whether OCR produced the text. Images always go through OCR; a PDF does
    # only when it has no embedded text, which costs one extra pdfplumber pass
    # to find out. The previous `"ocr_text" in locals()` test looked for a
    # variable local to extract_text_from_pdf, so it always reported "No".
    if ext == "pdf":
        ocr_applied = not _pdf_has_text_layer(file_path)
    else:
        ocr_applied = ext in image_exts

    # Grammar & Spelling
    grammar_issue = check_grammar(text)

    # Dates
    dates = extract_dates(text)
    issue_dates, event_dates = classify_dates(text, dates)

    # Classification (only the first 1000 characters are classified)
    labels = ["REAL", "FAKE"]
    result = classifier(text[:1000], candidate_labels=labels)

    # Build Report
    report = "📄 Evidence Report\n\n"
    report += "🔎 Document Analysis\n\n"
    report += f"File Type: {ext.upper()}\n"
    report += "OCR Applied: " + ("Yes" if ocr_applied else "No") + "\n\n"

    report += "✅ Evidence Considered\n\n"
    if grammar_issue:
        report += "Minor grammar/spelling issues were detected but do not affect authenticity.\n\n"
    else:
        report += "No major grammar or spelling issues detected.\n\n"

    if issue_dates:
        report += f"📌 Document Issue Date(s): {', '.join(issue_dates)}\n"
    if event_dates:
        report += f"📌 Event/Holiday Date(s): {', '.join(event_dates)}\n"
    if not dates:
        report += "No specific dates were clearly detected.\n"

    # NOTE(review): the next two sentences are emitted unconditionally; no
    # formatting or signature analysis is performed in this function.
    report += "\nDocument formatting and official tone resemble genuine university circulars.\n"
    report += "Signatures and registrar details align with standard official notices.\n\n"

    report += "🏁 Classification Result\n\n"
    # The verdict is the first label/score pair in the classifier output.
    report += f"Verdict: {result['labels'][0]}\n"
    report += f"Confidence: {result['scores'][0]:.2f}\n"

    return report



# ------------------------------
# Streamlit UI
# ------------------------------
st.title("📄 Document Verifier")
st.write("Upload a PDF, DOCX, or Image to check authenticity.")

uploaded_file = st.file_uploader("Upload file", type=["pdf", "docx", "png", "jpg", "jpeg"])

if uploaded_file is not None:
    # Store the upload in a temporary file rather than under its
    # client-supplied name in the working directory: that name is untrusted
    # and could overwrite an existing file. Only the extension is kept,
    # because verify_document picks the extractor from it.
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
        tmp_path = tmp.name
    # The file is closed before it is re-opened by path, and removed once the
    # report has been produced (or if verification raises).
    try:
        result = verify_document(tmp_path)
    finally:
        os.remove(tmp_path)
    st.text_area("📋 Evidence Report", result, height=400)