fizzarif7 committed on
Commit
70bebd1
·
verified ·
1 Parent(s): 067155f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -54
app.py CHANGED
@@ -1,5 +1,3 @@
1
-
2
-
3
  import gradio as gr
4
  from transformers import pipeline
5
  import pdfplumber
@@ -27,8 +25,7 @@ def extract_text_from_pdf(file_path):
27
  if page_text:
28
  text += page_text + "\n"
29
 
30
- # OCR fallback if no text extracted
31
- if not text.strip():
32
  ocr_text = ""
33
  doc = fitz.open(file_path)
34
  for page_num in range(len(doc)):
@@ -40,8 +37,8 @@ def extract_text_from_pdf(file_path):
40
  return text.strip()
41
 
42
  def extract_text_from_docx(file_path):
43
- doc = docx.Document(file_path)
44
- return "\n".join([p.text for p in doc.paragraphs]).strip()
45
 
46
  def extract_text_from_image(file_path):
47
  return pytesseract.image_to_string(Image.open(file_path)).strip()
@@ -68,9 +65,7 @@ def classify_dates(text, dates):
68
  issue_keywords = ["issued on", "dated", "notified on", "circular no"]
69
  event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]
70
 
71
- issue_dates = []
72
- event_dates = []
73
-
74
  for d in dates:
75
  idx = text.lower().find(d.lower())
76
  if idx != -1:
@@ -80,14 +75,9 @@ def classify_dates(text, dates):
80
  elif any(k in context for k in event_keywords):
81
  after_text = text[idx: idx+80]
82
  match = re.search(rf"{re.escape(d)}[^\n]*", after_text)
83
- if match:
84
- event_dates.append(match.group().strip())
85
- else:
86
- event_dates.append(d)
87
-
88
  if not issue_dates and dates:
89
  issue_dates.append(dates[0])
90
-
91
  return issue_dates, event_dates
92
 
93
  # ------------------------
@@ -96,78 +86,80 @@ def classify_dates(text, dates):
96
  def verify_text(text, source_type="TEXT"):
97
  if not text.strip():
98
  return "--- Evidence Report ---\n\n❌ No readable text provided."
99
-
100
  grammar_issue = check_grammar(text)
101
  dates = extract_dates(text)
102
  issue_dates, event_dates = classify_dates(text, dates)
103
-
104
  labels = ["REAL", "FAKE"]
105
  result = classifier(text[:1000], candidate_labels=labels)
106
 
107
  report = "📄 Evidence Report\n\n"
108
  report += "🔎 Document Analysis\n\n"
109
  report += f"Source: {source_type}\n\n"
110
-
111
  report += "✅ Evidence Considered\n\n"
112
- if grammar_issue:
113
- report += "Minor grammar/spelling issues were detected but do not affect authenticity.\n\n"
114
- else:
115
- report += "No major grammar or spelling issues detected.\n\n"
116
-
117
  if issue_dates:
118
  report += f"📌 Document Issue Date(s): {', '.join(issue_dates)}\n"
119
  if event_dates:
120
  report += f"📌 Event/Holiday Date(s): {', '.join(event_dates)}\n"
121
  if not dates:
122
  report += "No specific dates were clearly detected.\n"
123
-
124
- report += "\nDocument formatting and official tone resemble genuine university circulars.\n"
125
- report += "Signatures and registrar details align with standard official notices.\n\n"
126
-
127
  report += "🏁 Classification Result\n\n"
128
  report += f"Verdict: {result['labels'][0]}\n"
129
  report += f"Confidence: {result['scores'][0]:.2f}\n"
130
-
131
  return report
132
 
133
  def verify_document(file):
 
 
134
  file_path = file.name
135
  ext = file_path.split('.')[-1].lower()
136
  if ext == "pdf":
137
  text = extract_text_from_pdf(file_path)
 
138
  elif ext == "docx":
139
  text = extract_text_from_docx(file_path)
 
140
  elif ext in ["png", "jpg", "jpeg"]:
141
  text = extract_text_from_image(file_path)
 
142
  else:
143
- return "Unsupported file type."
144
- return verify_text(text, source_type=ext.upper())
 
 
 
 
 
145
 
146
  # ------------------------
147
- # Streamlit UI
148
  # ------------------------
149
- def process_input(file, manual_text):
150
- if file is not None:
151
- report = verify_document(file)
152
- return report
153
- elif manual_text.strip():
154
- report = verify_text(manual_text, source_type="MANUAL TEXT")
155
- return report
156
- else:
157
- return "❌ Please upload a document or paste text first."
158
-
159
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
160
  gr.Markdown("## 📑 Document Authenticity Verifier")
161
- gr.Markdown("Upload a **PDF, DOCX, or Image**, OR paste raw **text** to verify authenticity.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
- with gr.Row():
164
- file_input = gr.File(label="Upload Document", file_types=[".pdf", ".docx", ".png", ".jpg", ".jpeg"])
165
- text_input = gr.Textbox(label="Or paste the notification text here:", lines=10, placeholder="Paste text here...")
166
-
167
- verify_btn = gr.Button("Verify Document")
168
- output_box = gr.Textbox(label="Verification Report", lines=20)
169
-
170
- verify_btn.click(fn=process_input, inputs=[file_input, text_input], outputs=output_box)
171
-
172
- # launch app
173
- demo.launch()
 
 
 
1
  import gradio as gr
2
  from transformers import pipeline
3
  import pdfplumber
 
25
  if page_text:
26
  text += page_text + "\n"
27
 
28
+ if not text.strip(): # OCR fallback
 
29
  ocr_text = ""
30
  doc = fitz.open(file_path)
31
  for page_num in range(len(doc)):
 
37
  return text.strip()
38
 
39
  def extract_text_from_docx(file_path):
40
+ doc_file = docx.Document(file_path)
41
+ return "\n".join([p.text for p in doc_file.paragraphs]).strip()
42
 
43
  def extract_text_from_image(file_path):
44
  return pytesseract.image_to_string(Image.open(file_path)).strip()
 
65
  issue_keywords = ["issued on", "dated", "notified on", "circular no"]
66
  event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]
67
 
68
+ issue_dates, event_dates = [], []
 
 
69
  for d in dates:
70
  idx = text.lower().find(d.lower())
71
  if idx != -1:
 
75
  elif any(k in context for k in event_keywords):
76
  after_text = text[idx: idx+80]
77
  match = re.search(rf"{re.escape(d)}[^\n]*", after_text)
78
+ event_dates.append(match.group().strip() if match else d)
 
 
 
 
79
  if not issue_dates and dates:
80
  issue_dates.append(dates[0])
 
81
  return issue_dates, event_dates
82
 
83
  # ------------------------
 
86
  def verify_text(text, source_type="TEXT"):
87
  if not text.strip():
88
  return "--- Evidence Report ---\n\n❌ No readable text provided."
 
89
  grammar_issue = check_grammar(text)
90
  dates = extract_dates(text)
91
  issue_dates, event_dates = classify_dates(text, dates)
 
92
  labels = ["REAL", "FAKE"]
93
  result = classifier(text[:1000], candidate_labels=labels)
94
 
95
  report = "📄 Evidence Report\n\n"
96
  report += "🔎 Document Analysis\n\n"
97
  report += f"Source: {source_type}\n\n"
 
98
  report += "✅ Evidence Considered\n\n"
99
+ report += ("Minor grammar/spelling issues detected.\n\n" if grammar_issue
100
+ else "No major grammar or spelling issues detected.\n\n")
 
 
 
101
  if issue_dates:
102
  report += f"📌 Document Issue Date(s): {', '.join(issue_dates)}\n"
103
  if event_dates:
104
  report += f"📌 Event/Holiday Date(s): {', '.join(event_dates)}\n"
105
  if not dates:
106
  report += "No specific dates were clearly detected.\n"
107
+ report += "\nDocument formatting and tone resemble genuine notices.\n\n"
 
 
 
108
  report += "🏁 Classification Result\n\n"
109
  report += f"Verdict: {result['labels'][0]}\n"
110
  report += f"Confidence: {result['scores'][0]:.2f}\n"
 
111
  return report
112
 
113
  def verify_document(file):
114
+ if file is None:
115
+ return None, "❌ Please upload a file."
116
  file_path = file.name
117
  ext = file_path.split('.')[-1].lower()
118
  if ext == "pdf":
119
  text = extract_text_from_pdf(file_path)
120
+ preview = text[:1000] + ("..." if len(text) > 1000 else "")
121
  elif ext == "docx":
122
  text = extract_text_from_docx(file_path)
123
+ preview = text[:1000] + ("..." if len(text) > 1000 else "")
124
  elif ext in ["png", "jpg", "jpeg"]:
125
  text = extract_text_from_image(file_path)
126
+ preview = Image.open(file_path) # show image preview
127
  else:
128
+ return None, "Unsupported file type."
129
+ return preview, verify_text(text, source_type=ext.upper())
130
+
131
+ def process_text_input(manual_text):
132
+ if manual_text.strip():
133
+ return manual_text, verify_text(manual_text, source_type="MANUAL TEXT")
134
+ return None, "❌ Please paste some text first."
135
 
136
  # ------------------------
137
+ # Gradio UI
138
  # ------------------------
139
+ with gr.Blocks(theme=gr.themes.Soft(), css="""
140
+ #report-box {background:#f9f9fb; border-radius:10px; padding:15px; box-shadow:0 2px 6px rgba(0,0,0,0.1);}
141
+ #preview-box {background:#eef7ff; border-radius:10px; padding:15px; box-shadow:0 2px 6px rgba(0,0,0,0.1);}
142
+ """) as demo:
 
 
 
 
 
 
 
143
  gr.Markdown("## 📑 Document Authenticity Verifier")
144
+ gr.Markdown("Choose an option below to verify your document:")
145
+
146
+ with gr.Tabs():
147
+ with gr.Tab("📂 Upload File"):
148
+ file_input = gr.File(label="Upload Document", file_types=[".pdf", ".docx", ".png", ".jpg", ".jpeg"])
149
+ preview_box = gr.Component(label="📄 File Preview", elem_id="preview-box")
150
+ report_box = gr.Textbox(label="Verification Report", lines=20, elem_id="report-box")
151
+ verify_btn_file = gr.Button("🔍 Verify Document")
152
+ verify_btn_file.click(fn=verify_document, inputs=file_input, outputs=[preview_box, report_box])
153
+
154
+ with gr.Tab("📝 Paste Text"):
155
+ text_input = gr.Textbox(label="Paste Notification Text", lines=10, placeholder="Paste text here...")
156
+ preview_text = gr.Textbox(label="Text Preview", lines=10, elem_id="preview-box")
157
+ report_box_text = gr.Textbox(label="Verification Report", lines=20, elem_id="report-box")
158
+ verify_btn_text = gr.Button("🔍 Verify Text")
159
+ verify_btn_text.click(fn=process_text_input, inputs=text_input, outputs=[preview_text, report_box_text])
160
 
161
+ # ------------------------
162
+ # Launch
163
+ # ------------------------
164
+ if __name__ == "__main__":
165
+ demo.launch()