fizzarif7 committed on
Commit
e508a24
·
verified ·
1 Parent(s): b5f9b9a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -52
app.py CHANGED
@@ -1,23 +1,33 @@
1
- import os
2
- os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
3
- os.environ["STREAMLIT_SERVER_PORT"] = os.environ.get("PORT", "7860")
4
- os.environ["STREAMLIT_SERVER_ADDRESS"] = "0.0.0.0"
5
 
6
  import streamlit as st
7
- from transformers import pipeline
8
  import pdfplumber
9
  import docx
10
  from PIL import Image
11
- import pytesseract
12
  from textblob import TextBlob
13
  import re
14
- import fitz # βœ… PyMuPDF instead of pdf2image
15
- import os
 
 
16
 
17
  # ------------------------
18
  # Hugging Face Model
19
- # ------------------------
20
- classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  # ------------------------
23
  # Extraction Functions
@@ -30,8 +40,7 @@ def extract_text_from_pdf(file_path):
30
  if page_text:
31
  text += page_text + "\n"
32
 
33
- # OCR fallback if no text extracted
34
- if not text.strip():
35
  ocr_text = ""
36
  doc = fitz.open(file_path)
37
  for page_num in range(len(doc)):
@@ -71,9 +80,7 @@ def classify_dates(text, dates):
71
  issue_keywords = ["issued on", "dated", "notified on", "circular no"]
72
  event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]
73
 
74
- issue_dates = []
75
- event_dates = []
76
-
77
  for d in dates:
78
  idx = text.lower().find(d.lower())
79
  if idx != -1:
@@ -83,14 +90,10 @@ def classify_dates(text, dates):
83
  elif any(k in context for k in event_keywords):
84
  after_text = text[idx: idx+80]
85
  match = re.search(rf"{re.escape(d)}[^\n]*", after_text)
86
- if match:
87
- event_dates.append(match.group().strip())
88
- else:
89
- event_dates.append(d)
90
 
91
  if not issue_dates and dates:
92
  issue_dates.append(dates[0])
93
-
94
  return issue_dates, event_dates
95
 
96
  # ------------------------
@@ -100,41 +103,110 @@ def verify_text(text, source_type="TEXT"):
100
  if not text.strip():
101
  return "--- Evidence Report ---\n\n❌ No readable text provided."
102
 
 
 
 
103
  grammar_issue = check_grammar(text)
104
  dates = extract_dates(text)
105
  issue_dates, event_dates = classify_dates(text, dates)
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  labels = ["REAL", "FAKE"]
108
  result = classifier(text[:1000], candidate_labels=labels)
 
 
109
 
 
 
 
 
 
 
 
 
 
 
 
110
  report = "πŸ“„ Evidence Report\n\n"
111
  report += "πŸ”Ž Document Analysis\n\n"
112
  report += f"Source: {source_type}\n\n"
113
 
114
  report += "βœ… Evidence Considered\n\n"
115
  if grammar_issue:
116
- report += "Minor grammar/spelling issues were detected but do not affect authenticity.\n\n"
117
  else:
118
- report += "No major grammar or spelling issues detected.\n\n"
119
 
120
- if issue_dates:
121
- report += f"πŸ“Œ Document Issue Date(s): {', '.join(issue_dates)}\n"
122
- if event_dates:
123
- report += f"πŸ“Œ Event/Holiday Date(s): {', '.join(event_dates)}\n"
124
- if not dates:
125
- report += "No specific dates were clearly detected.\n"
126
 
127
- report += "\nDocument formatting and official tone resemble genuine university circulars.\n"
128
- report += "Signatures and registrar details align with standard official notices.\n\n"
 
 
129
 
 
130
  report += "🏁 Classification Result\n\n"
131
- report += f"Verdict: {result['labels'][0]}\n"
132
- report += f"Confidence: {result['scores'][0]:.2f}\n"
133
 
134
  return report
135
 
 
 
 
136
  def verify_document(file):
137
- file_path = file.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  ext = file_path.split('.')[-1].lower()
139
  if ext == "pdf":
140
  text = extract_text_from_pdf(file_path)
@@ -143,31 +215,44 @@ def verify_document(file):
143
  elif ext in ["png", "jpg", "jpeg"]:
144
  text = extract_text_from_image(file_path)
145
  else:
146
- return "Unsupported file type."
 
147
  return verify_text(text, source_type=ext.upper())
148
 
 
 
 
 
 
 
 
 
 
 
149
  # ------------------------
150
  # Streamlit UI
151
  # ------------------------
152
- st.set_page_config(page_title="πŸ“‘ Document Authenticity Verifier", layout="wide")
153
-
 
 
154
  st.title("πŸ“‘ Document Authenticity Verifier")
155
- st.write("Upload a **PDF, DOCX, or Image**, OR paste raw **text** to verify authenticity.")
156
 
157
- # File uploader
158
- uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "png", "jpg", "jpeg"])
 
 
 
159
 
160
- # Text input
161
- manual_text = st.text_area("Or paste the notification text here:")
 
 
 
 
 
 
 
 
 
162
 
163
- if st.button("Verify Document"):
164
- if uploaded_file is not None:
165
- with open(uploaded_file.name, "wb") as f:
166
- f.write(uploaded_file.getbuffer())
167
- report = verify_document(uploaded_file)
168
- st.text_area("Verification Report", report, height=400)
169
- elif manual_text.strip():
170
- report = verify_text(manual_text, source_type="MANUAL TEXT")
171
- st.text_area("Verification Report", report, height=400)
172
- else:
173
- st.warning("Please upload a document or paste text first.")
 
 
 
 
 
1
 
2
import os

import streamlit as st
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import pdfplumber
import docx
from PIL import Image

from textblob import TextBlob
import re
import fitz
import pytesseract

# Point pytesseract at the Windows install only if that path actually exists.
# Hard-coding the Windows path unconditionally breaks OCR on Linux hosts
# (e.g. Hugging Face Spaces), where the system `tesseract` on PATH must be used.
_TESSERACT_WIN_PATH = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if os.path.exists(_TESSERACT_WIN_PATH):
    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_WIN_PATH

# ------------------------
# Hugging Face Model
# ------------------------
# Load the zero-shot NLI model once at import time so every request reuses it.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")

classifier = pipeline(
    "zero-shot-classification",
    model=model,
    tokenizer=tokenizer,
    device=-1  # -1 forces CPU inference (no GPU assumed on the host)
)
31
 
32
  # ------------------------
33
  # Extraction Functions
 
40
  if page_text:
41
  text += page_text + "\n"
42
 
43
+ if not text.strip(): # OCR fallback
 
44
  ocr_text = ""
45
  doc = fitz.open(file_path)
46
  for page_num in range(len(doc)):
 
80
  issue_keywords = ["issued on", "dated", "notified on", "circular no"]
81
  event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]
82
 
83
+ issue_dates, event_dates = [], []
 
 
84
  for d in dates:
85
  idx = text.lower().find(d.lower())
86
  if idx != -1:
 
90
  elif any(k in context for k in event_keywords):
91
  after_text = text[idx: idx+80]
92
  match = re.search(rf"{re.escape(d)}[^\n]*", after_text)
93
+ event_dates.append(match.group().strip() if match else d)
 
 
 
94
 
95
  if not issue_dates and dates:
96
  issue_dates.append(dates[0])
 
97
  return issue_dates, event_dates
98
 
99
  # ------------------------
 
103
  if not text.strip():
104
  return "--- Evidence Report ---\n\n❌ No readable text provided."
105
 
106
+ # ------------------------
107
+ # Heuristic Checks
108
+ # ------------------------
109
  grammar_issue = check_grammar(text)
110
  dates = extract_dates(text)
111
  issue_dates, event_dates = classify_dates(text, dates)
112
 
113
+ # Scam / fake indicators
114
+ scam_keywords = [
115
+ "bank details", "send money", "lottery", "win prize",
116
+ "transfer fee", "urgent", "click here", "claim", "scholarship $"
117
+ ]
118
+ scam_detected = any(kw in text.lower() for kw in scam_keywords)
119
+
120
+ # Date consistency check
121
+ contradiction = False
122
+ if issue_dates and event_dates:
123
+ try:
124
+ from datetime import datetime
125
+ fmt_variants = ["%d/%m/%Y", "%d-%m-%Y", "%d.%m.%Y", "%d %B %Y", "%B %d, %Y"]
126
+
127
+ def parse_date(d):
128
+ for fmt in fmt_variants:
129
+ try:
130
+ return datetime.strptime(d, fmt)
131
+ except Exception:
132
+ continue
133
+ return None
134
+
135
+ parsed_issue = parse_date(issue_dates[0])
136
+ parsed_event = parse_date(event_dates[0])
137
+ if parsed_issue and parsed_event and parsed_event < parsed_issue:
138
+ contradiction = True
139
+ except Exception:
140
+ pass
141
+
142
+ # ------------------------
143
+ # Hugging Face Model
144
+ # ------------------------
145
  labels = ["REAL", "FAKE"]
146
  result = classifier(text[:1000], candidate_labels=labels)
147
+ model_label = result['labels'][0]
148
+ model_confidence = result['scores'][0]
149
 
150
+ # ------------------------
151
+ # Final Verdict Logic
152
+ # ------------------------
153
+ final_label = model_label
154
+ if scam_detected or contradiction or grammar_issue:
155
+ # downgrade to FAKE if red flags appear
156
+ final_label = "FAKE"
157
+
158
+ # ------------------------
159
+ # Report
160
+ # ------------------------
161
  report = "πŸ“„ Evidence Report\n\n"
162
  report += "πŸ”Ž Document Analysis\n\n"
163
  report += f"Source: {source_type}\n\n"
164
 
165
  report += "βœ… Evidence Considered\n\n"
166
  if grammar_issue:
167
+ report += "⚠️ Grammar/Spelling issues detected.\n"
168
  else:
169
+ report += "No grammar issues detected.\n"
170
 
171
+ if issue_dates:
172
+ report += f"πŸ“Œ Issue Date(s): {', '.join(issue_dates)}\n"
173
+ if event_dates:
174
+ report += f"πŸ“Œ Event Date(s): {', '.join(event_dates)}\n"
175
+ if not dates:
176
+ report += "No specific dates detected.\n"
177
 
178
+ if contradiction:
179
+ report += "⚠️ Date inconsistency detected (event before issue date).\n"
180
+ if scam_detected:
181
+ report += "⚠️ Scam-related keywords detected.\n"
182
 
183
+ report += "\nFormatting and tone analyzed.\n\n"
184
  report += "🏁 Classification Result\n\n"
185
+ report += f"Model Verdict: {model_label} ({model_confidence:.2f})\n"
186
+ report += f"Final Verdict: {final_label}\n"
187
 
188
  return report
189
 
190
+ import tempfile
191
+ import os
192
+
193
  def verify_document(file):
194
+ if file is None:
195
+ return "❌ Please upload a file or provide a file path."
196
+
197
+ # Case 1: If input is a string (direct file path)
198
+ if isinstance(file, str):
199
+ file_path = file
200
+
201
+ # Case 2: If input is an uploaded file (Streamlit/Colab)
202
+ else:
203
+ # Save to a temporary file
204
+ suffix = os.path.splitext(file.name)[-1]
205
+ with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
206
+ tmp.write(file.read())
207
+ file_path = tmp.name
208
+
209
+ # Detect file type and extract
210
  ext = file_path.split('.')[-1].lower()
211
  if ext == "pdf":
212
  text = extract_text_from_pdf(file_path)
 
215
  elif ext in ["png", "jpg", "jpeg"]:
216
  text = extract_text_from_image(file_path)
217
  else:
218
+ return "❌ Unsupported file type."
219
+
220
  return verify_text(text, source_type=ext.upper())
221
 
222
+
223
+
224
def process_input(file, manual_text):
    """Dispatch user input to the right verifier.

    An uploaded file takes precedence over pasted text; when neither is
    supplied, return a user-facing error message instead of raising.
    """
    if file is None:
        text = manual_text.strip()
        if text:
            return verify_text(manual_text, source_type="MANUAL TEXT")
        return "❌ Please upload a document or paste text first."
    return verify_document(file)
231
+
232
# ------------------------
# Streamlit UI
# ------------------------
# (fix: the banner comment above was duplicated verbatim in the original)
st.set_page_config(page_title="Document Verifier", layout="centered")
st.title("πŸ“‘ Document Authenticity Verifier")

# Two independent input channels: a file upload and a free-text paste box.
uploaded_file = st.file_uploader(
    "Upload a document (PDF, DOCX, PNG, JPG)",
    type=["pdf", "docx", "png", "jpg", "jpeg"]
)
manual_text = st.text_area("Or paste text manually")

# Button for uploaded files
if st.button("Verify Uploaded Document"):
    with st.spinner("Analyzing uploaded document..."):
        result = process_input(uploaded_file, "")
        st.text_area("Evidence Report", value=result, height=400)

# Button for manual text
if st.button("Verify Manual Text"):
    with st.spinner("Analyzing manual text..."):
        result = process_input(None, manual_text)
        st.text_area("Evidence Report", value=result, height=400)
258