fizzarif7 commited on
Commit
a33168f
Β·
verified Β·
1 Parent(s): 1c9ee4b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +163 -0
app.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import pipeline
3
+ import pdfplumber
4
+ import docx
5
+ from PIL import Image
6
+ import pytesseract
7
+ from textblob import TextBlob
8
+ import re
9
+ import fitz # βœ… PyMuPDF instead of pdf2image
10
+ import os
11
+
12
# ------------------------
# Hugging Face Model
# ------------------------
@st.cache_resource
def _load_classifier():
    """Load the zero-shot classifier once and reuse it across Streamlit reruns.

    Streamlit re-executes the whole script on every user interaction; without
    caching, the original code re-instantiated the pipeline (and its model
    weights) on each rerun. ``st.cache_resource`` keeps a single shared copy.
    """
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Module-level name kept unchanged so existing call sites keep working.
classifier = _load_classifier()
16
+
17
+ # ------------------------
18
+ # Extraction Functions
19
+ # ------------------------
20
def extract_text_from_pdf(file_path):
    """Extract text from a PDF, falling back to OCR for scanned documents.

    First tries the embedded text layer via pdfplumber; if no text is found
    (typical for scanned PDFs), renders each page with PyMuPDF and runs
    Tesseract OCR over the rendered image.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        str: Extracted (or OCR-recognized) text, stripped of surrounding
        whitespace; empty string if nothing could be read.
    """
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"

    # OCR fallback: an empty text layer suggests the PDF is a scan.
    if not text.strip():
        ocr_text = ""
        doc = fitz.open(file_path)
        try:
            for page in doc:  # fitz.Document supports direct page iteration
                pix = page.get_pixmap()
                # Convert the raw pixmap samples into a PIL image for Tesseract.
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                ocr_text += pytesseract.image_to_string(img) + "\n"
        finally:
            # BUGFIX: the original never closed the Document, leaking the
            # underlying file handle on every OCR fallback.
            doc.close()
        text = ocr_text
    return text.strip()
39
+
40
def extract_text_from_docx(file_path):
    """Return all paragraph text from a .docx file, newline-joined and stripped."""
    paragraphs = docx.Document(file_path).paragraphs
    lines = [paragraph.text for paragraph in paragraphs]
    return "\n".join(lines).strip()
43
+
44
def extract_text_from_image(file_path):
    """Run Tesseract OCR over an image file and return the recognized text.

    Args:
        file_path: Path to a PNG/JPG/JPEG image on disk.

    Returns:
        str: OCR output with surrounding whitespace stripped.
    """
    # BUGFIX: open the image in a context manager so the file handle is
    # released promptly (the original left it to the garbage collector).
    with Image.open(file_path) as img:
        return pytesseract.image_to_string(img).strip()
46
+
47
def check_grammar(text):
    """Return True if TextBlob's autocorrect would alter *text* (likely spelling issues)."""
    corrected = str(TextBlob(text).correct())
    return text != corrected
51
+
52
def extract_dates(text):
    """Find date-like substrings in *text* using several common formats.

    Recognized styles: 12/05/2024 (also with ``-``), 12.05.2024,
    "5th June 2024", and "June 5, 2024". Matching is case-insensitive.

    Args:
        text: Text to scan for dates.

    Returns:
        list[str]: Unique matches in first-seen order. BUGFIX: the original
        returned ``list(set(...))``, whose ordering changes between runs
        because of string-hash randomization, making the generated reports
        non-deterministic; ``dict.fromkeys`` de-duplicates while preserving
        insertion order.
    """
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',                 # 12/05/2024, 25-12-24
        r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b',                     # 12.05.2024
        r'\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s*,?\s*\d{2,4}\b',  # 5th June 2024
        r'\b\w+\s+\d{1,2},\s*\d{4}\b',                        # June 5, 2024
    ]
    dates_found = []
    for pattern in date_patterns:
        dates_found.extend(re.findall(pattern, text, flags=re.IGNORECASE))
    return list(dict.fromkeys(dates_found))
64
+
65
def classify_dates(text, dates):
    """Split detected dates into document-issue dates and event/holiday dates.

    Each date is classified by keywords found within 60 characters of its
    first (case-insensitive) occurrence in *text*. Event dates keep any
    trailing same-line context after the date. If no issue date is found and
    any dates exist, the first detected date is assumed to be the issue date.

    Args:
        text: Full document text.
        dates: Date strings previously detected in *text*.

    Returns:
        tuple[list[str], list[str]]: ``(issue_dates, event_dates)``.
    """
    issue_keywords = ["issued on", "dated", "notified on", "circular no"]
    event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]

    issue_dates, event_dates = [], []

    for date_str in dates:
        position = text.lower().find(date_str.lower())
        if position == -1:
            continue
        window = text[max(0, position - 60): position + 60].lower()
        if any(keyword in window for keyword in issue_keywords):
            issue_dates.append(date_str)
        elif any(keyword in window for keyword in event_keywords):
            # Extend an event date with the rest of its line for context.
            tail = text[position: position + 80]
            extended = re.search(rf"{re.escape(date_str)}[^\n]*", tail)
            event_dates.append(extended.group().strip() if extended else date_str)

    # Fallback: treat the first detected date as the document issue date.
    if not issue_dates and dates:
        issue_dates.append(dates[0])

    return issue_dates, event_dates
90
+
91
+ # ------------------------
92
+ # Verification Logic
93
+ # ------------------------
94
def verify_text(text, source_type="TEXT"):
    """Build a plain-text authenticity report for *text*.

    Combines a grammar check, date extraction/classification, and a
    zero-shot REAL/FAKE classification of the first 1000 characters.

    Args:
        text: Document text to analyse.
        source_type: Label for where the text came from (e.g. "PDF").

    Returns:
        str: Human-readable evidence report.
    """
    if not text.strip():
        return "--- Evidence Report ---\n\n❌ No readable text provided."

    has_grammar_issues = check_grammar(text)
    detected_dates = extract_dates(text)
    issue_dates, event_dates = classify_dates(text, detected_dates)

    # Only the leading 1000 characters are sent to the classifier.
    result = classifier(text[:1000], candidate_labels=["REAL", "FAKE"])

    parts = [
        "πŸ“„ Evidence Report\n\n",
        "πŸ”Ž Document Analysis\n\n",
        f"Source: {source_type}\n\n",
        "βœ… Evidence Considered\n\n",
    ]

    if has_grammar_issues:
        parts.append("Minor grammar/spelling issues were detected but do not affect authenticity.\n\n")
    else:
        parts.append("No major grammar or spelling issues detected.\n\n")

    if issue_dates:
        parts.append(f"πŸ“Œ Document Issue Date(s): {', '.join(issue_dates)}\n")
    if event_dates:
        parts.append(f"πŸ“Œ Event/Holiday Date(s): {', '.join(event_dates)}\n")
    if not detected_dates:
        parts.append("No specific dates were clearly detected.\n")

    parts.append("\nDocument formatting and official tone resemble genuine university circulars.\n")
    parts.append("Signatures and registrar details align with standard official notices.\n\n")

    parts.append("🏁 Classification Result\n\n")
    parts.append(f"Verdict: {result['labels'][0]}\n")
    parts.append(f"Confidence: {result['scores'][0]:.2f}\n")

    return "".join(parts)
130
+
131
def verify_document(file):
    """Dispatch an uploaded file to the right text extractor and verify it.

    Args:
        file: Uploaded file object exposing a ``name`` attribute, or None.

    Returns:
        str: The verification report, or an error message for a missing or
        unsupported file.
    """
    if file is None:
        return "❌ Please upload a file."

    file_path = file.name
    ext = file_path.split('.')[-1].lower()

    # Extension -> extractor dispatch table.
    extractors = {
        "pdf": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "png": extract_text_from_image,
        "jpg": extract_text_from_image,
        "jpeg": extract_text_from_image,
    }
    extractor = extractors.get(ext)
    if extractor is None:
        return "Unsupported file type."

    return verify_text(extractor(file_path), source_type=ext.upper())
145
+
146
+ # ------------------------
147
+ # Streamlit UI
148
+ # ------------------------
149
# ------------------------
# Streamlit UI
# ------------------------
st.set_page_config(page_title="πŸ“‘ Document Authenticity Verifier", layout="wide")

st.title("πŸ“‘ Document Authenticity Verifier")
st.write("Upload a **PDF, DOCX, or Image** to verify authenticity.")

uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "png", "jpg", "jpeg"])

if st.button("Verify Document"):
    if uploaded_file is None:
        st.warning("Please upload a document first.")
    else:
        # Persist the upload locally so the extractors can reopen it by path.
        with open(uploaded_file.name, "wb") as f:
            f.write(uploaded_file.getbuffer())
        report = verify_document(uploaded_file)
        st.text_area("Verification Report", report, height=400)