File size: 8,120 Bytes
3357c31
a33168f
e508a24
a33168f
 
 
e508a24
a33168f
 
e508a24
 
1b9165b
e508a24
a33168f
 
 
e508a24
 
 
 
 
 
 
 
 
 
 
 
 
 
a33168f
 
 
 
 
 
 
 
 
 
 
 
e508a24
a33168f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e508a24
a33168f
 
 
 
 
 
 
 
 
e508a24
a33168f
 
 
 
 
 
 
 
 
 
 
 
e508a24
 
 
a33168f
 
 
 
e508a24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a33168f
 
e508a24
 
a33168f
e508a24
 
 
 
 
 
 
 
 
 
 
a33168f
 
 
 
 
 
e508a24
a33168f
e508a24
a33168f
e508a24
 
 
 
 
 
a33168f
e508a24
 
 
 
a33168f
e508a24
a33168f
e508a24
 
a33168f
 
 
e508a24
 
 
a33168f
e508a24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a33168f
 
 
 
 
 
 
 
e508a24
 
a33168f
 
e508a24
 
 
 
 
 
 
 
 
 
a33168f
 
 
e508a24
 
 
 
a33168f
 
e508a24
 
 
 
 
a33168f
e508a24
 
 
 
 
 
 
 
 
 
 
67e1abc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259

import streamlit as st
from transformers import pipeline,AutoModelForSequenceClassification, AutoTokenizer
import pdfplumber
import docx
from PIL import Image

from textblob import TextBlob
import re
import fitz
import pytesseract



# ------------------------
# Hugging Face Model
# ------------------------
# Loaded once at module import time (weights are downloaded from the Hub on
# the first run, so startup can be slow). The zero-shot pipeline built here
# is the module-level `classifier` consumed by verify_text().




tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")

# device=-1 pins inference to CPU; verify_text() feeds this at most 1000 chars.
classifier = pipeline(
    "zero-shot-classification",
    model=model,
    tokenizer=tokenizer,
    device=-1
)


# ------------------------
# Extraction Functions
# ------------------------
def extract_text_from_pdf(file_path):
    """Extract text from a PDF at ``file_path``.

    Tries the embedded text layer first (pdfplumber); if the document has no
    extractable text (e.g. a scanned PDF), falls back to rendering each page
    with PyMuPDF and running Tesseract OCR on the bitmap.

    Returns the extracted text, stripped of surrounding whitespace.
    """
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"

    if not text.strip():  # no text layer -> OCR fallback
        ocr_parts = []
        doc = fitz.open(file_path)
        try:
            for page in doc:
                pix = page.get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                ocr_parts.append(pytesseract.image_to_string(img) + "\n")
        finally:
            # Fix: the original never closed the fitz Document, leaking the
            # file handle on every OCR fallback.
            doc.close()
        text = "".join(ocr_parts)
    return text.strip()

def extract_text_from_docx(file_path):
    """Return all paragraph text from a .docx file, newline-joined and trimmed."""
    document = docx.Document(file_path)
    paragraphs = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraphs).strip()

def extract_text_from_image(file_path):
    """OCR the image at ``file_path`` with Tesseract and return the trimmed text."""
    image = Image.open(file_path)
    recognized = pytesseract.image_to_string(image)
    return recognized.strip()

def check_grammar(text):
    """Return True when TextBlob's autocorrect would alter ``text``.

    A difference between the corrected and original text is treated as a
    grammar/spelling issue by the caller.
    """
    corrected = str(TextBlob(text).correct())
    return corrected != text

def extract_dates(text):
    """Find date-like substrings in ``text``.

    Recognizes numeric forms (12/05/2023, 12-05-2023, 12.05.2023) and
    worded forms (5th March 2023, March 5, 2024). Matching is
    case-insensitive.

    Returns a de-duplicated list of the matches in first-seen order.
    (Fix: the original returned ``list(set(...))``, whose order varies
    between runs, making reports nondeterministic.)
    """
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
        r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b',
        r'\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s*,?\s*\d{2,4}\b',
        r'\b\w+\s+\d{1,2},\s*\d{4}\b',
    ]
    dates_found = []
    for pattern in date_patterns:
        dates_found.extend(re.findall(pattern, text, flags=re.IGNORECASE))
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(dates_found))

def classify_dates(text, dates):
    """Split ``dates`` into (issue_dates, event_dates) using nearby keywords.

    For each date, a 120-char window around its first occurrence is scanned:
    issue keywords win over event keywords. Event dates keep trailing context
    up to the end of the line (max ~80 chars). If no issue date is found but
    dates exist, the first date is assumed to be the issue date.
    """
    issue_hints = ["issued on", "dated", "notified on", "circular no"]
    event_hints = ["holiday", "observed on", "exam on", "will be held on", "effective from"]

    issue_dates = []
    event_dates = []
    lowered = text.lower()

    for date_str in dates:
        pos = lowered.find(date_str.lower())
        if pos == -1:
            continue
        window = lowered[max(0, pos - 60): pos + 60]
        if any(hint in window for hint in issue_hints):
            issue_dates.append(date_str)
        elif any(hint in window for hint in event_hints):
            # Carry trailing context (e.g. "15/08/2023 in all offices").
            snippet = re.search(rf"{re.escape(date_str)}[^\n]*", text[pos: pos + 80])
            event_dates.append(snippet.group().strip() if snippet else date_str)

    # Fallback: treat the first detected date as the issue date.
    if dates and not issue_dates:
        issue_dates.append(dates[0])
    return issue_dates, event_dates

# ------------------------
# Verification Logic
# ------------------------
def verify_text(text, source_type="TEXT"):
    """Run heuristic and model checks over ``text`` and return a report string.

    Heuristics: TextBlob grammar check, date extraction + issue/event
    classification, scam-keyword scan, and an event-before-issue date
    consistency check. The zero-shot classifier (module-level ``classifier``)
    votes REAL/FAKE on the first 1000 chars; any heuristic red flag forces
    the final verdict to FAKE.

    ``source_type`` is echoed into the report (e.g. "PDF", "MANUAL TEXT").
    """
    if not text.strip():
        return "--- Evidence Report ---\n\n❌ No readable text provided."

    # ------------------------
    # Heuristic Checks
    # ------------------------
    grammar_issue = check_grammar(text)
    dates = extract_dates(text)
    issue_dates, event_dates = classify_dates(text, dates)

    # Scam / fake indicators
    scam_keywords = [
        "bank details", "send money", "lottery", "win prize",
        "transfer fee", "urgent", "click here", "claim", "scholarship $"
    ]
    scam_detected = any(kw in text.lower() for kw in scam_keywords)

    # Date consistency check: an event scheduled before the document's own
    # issue date is a contradiction.
    contradiction = False
    if issue_dates and event_dates:
        from datetime import datetime

        fmt_variants = ["%d/%m/%Y", "%d-%m-%Y", "%d.%m.%Y", "%d %B %Y", "%B %d, %Y"]

        def parse_date(d):
            # Try each known format; unparseable strings yield None and the
            # consistency check is simply skipped.
            # Fix: the original wrapped this whole section in a broad
            # `except Exception: pass`, which could silently hide real bugs;
            # strptime format mismatches raise ValueError, so only that is
            # caught here.
            for fmt in fmt_variants:
                try:
                    return datetime.strptime(d, fmt)
                except ValueError:
                    continue
            return None

        parsed_issue = parse_date(issue_dates[0])
        parsed_event = parse_date(event_dates[0])
        if parsed_issue and parsed_event and parsed_event < parsed_issue:
            contradiction = True

    # ------------------------
    # Hugging Face Model
    # ------------------------
    # Truncate to 1000 chars to keep zero-shot inference fast.
    labels = ["REAL", "FAKE"]
    result = classifier(text[:1000], candidate_labels=labels)
    model_label = result['labels'][0]
    model_confidence = result['scores'][0]

    # ------------------------
    # Final Verdict Logic
    # ------------------------
    final_label = model_label
    if scam_detected or contradiction or grammar_issue:
        # downgrade to FAKE if red flags appear
        final_label = "FAKE"

    # ------------------------
    # Report
    # ------------------------
    report = "πŸ“„ Evidence Report\n\n"
    report += "πŸ”Ž Document Analysis\n\n"
    report += f"Source: {source_type}\n\n"

    report += "βœ… Evidence Considered\n\n"
    if grammar_issue:
        report += "⚠️ Grammar/Spelling issues detected.\n"
    else:
        report += "No grammar issues detected.\n"

    if issue_dates:
        report += f"πŸ“Œ Issue Date(s): {', '.join(issue_dates)}\n"
    if event_dates:
        report += f"πŸ“Œ Event Date(s): {', '.join(event_dates)}\n"
    if not dates:
        report += "No specific dates detected.\n"

    if contradiction:
        report += "⚠️ Date inconsistency detected (event before issue date).\n"
    if scam_detected:
        report += "⚠️ Scam-related keywords detected.\n"

    report += "\nFormatting and tone analyzed.\n\n"
    report += "🏁 Classification Result\n\n"
    report += f"Model Verdict: {model_label} ({model_confidence:.2f})\n"
    report += f"Final Verdict: {final_label}\n"

    return report

import tempfile
import os

def verify_document(file):
    """Verify a document and return the evidence-report string.

    ``file`` may be a filesystem path (str) or an uploaded file-like object
    exposing ``.name`` and ``.read()`` (e.g. a Streamlit UploadedFile).
    Uploads are persisted to a temporary file so the path-based extractors
    can open them; the temp file is removed afterwards.

    Supported types: pdf, docx, png, jpg, jpeg. Anything else (including a
    path with no extension) yields an error string.
    """
    if file is None:
        return "❌ Please upload a file or provide a file path."

    tmp_path = None
    try:
        # Case 1: direct file path
        if isinstance(file, str):
            file_path = file
        # Case 2: uploaded file object -> spill to a temp file
        else:
            suffix = os.path.splitext(file.name)[-1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp.write(file.read())
                tmp_path = file_path = tmp.name

        # Fix: extension now derived with os.path.splitext (consistent with
        # the suffix logic above); the old split('.')[-1] returned the whole
        # path when there was no dot in it.
        ext = os.path.splitext(file_path)[-1].lstrip('.').lower()
        if ext == "pdf":
            text = extract_text_from_pdf(file_path)
        elif ext == "docx":
            text = extract_text_from_docx(file_path)
        elif ext in ("png", "jpg", "jpeg"):
            text = extract_text_from_image(file_path)
        else:
            return "❌ Unsupported file type."

        return verify_text(text, source_type=ext.upper())
    finally:
        # Fix: the original leaked one NamedTemporaryFile per upload.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass  # best-effort cleanup



def process_input(file, manual_text):
    """Route UI input to the right verifier.

    An uploaded file takes priority over pasted text; if neither is present,
    an error string is returned.
    """
    if file is not None:
        return verify_document(file)
    if manual_text.strip():
        return verify_text(manual_text, source_type="MANUAL TEXT")
    return "❌ Please upload a document or paste text first."

# ------------------------
# Streamlit UI
# ------------------------
# Widgets render in statement order: page config must come first, then the
# two inputs (file upload + manual text), then one verify button per input.
st.set_page_config(page_title="Document Verifier", layout="centered")
st.title("πŸ“‘ Document Authenticity Verifier")

uploaded_file = st.file_uploader(
    "Upload a document (PDF, DOCX, PNG, JPG)", 
    type=["pdf", "docx", "png", "jpg", "jpeg"]
)
manual_text = st.text_area("Or paste text manually")

# Button for uploaded files; manual text is deliberately ignored here.
if st.button("Verify Uploaded Document"):
    with st.spinner("Analyzing uploaded document..."):
        result = process_input(uploaded_file, "")
    st.text_area("Evidence Report", value=result, height=400)

# Button for manual text; the upload is deliberately ignored here.
if st.button("Verify Manual Text"):
    with st.spinner("Analyzing manual text..."):
        result = process_input(None, manual_text)
    st.text_area("Evidence Report", value=result, height=400)