Spaces:

fizzarif7
/

docVerifier

No application file

App Files Files Community

fizzarif7 committed on Aug 29, 2025

Commit

70bebd1

verified ·

1 Parent(s): 067155f

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -54

app.py CHANGED Viewed

@@ -1,5 +1,3 @@
 import gradio as gr
 from transformers import pipeline
 import pdfplumber
@@ -27,8 +25,7 @@ def extract_text_from_pdf(file_path):
             if page_text:
                 text += page_text + "\n"
-    # OCR fallback if no text extracted
-    if not text.strip():
         ocr_text = ""
         doc = fitz.open(file_path)
         for page_num in range(len(doc)):
@@ -40,8 +37,8 @@ def extract_text_from_pdf(file_path):
     return text.strip()
 def extract_text_from_docx(file_path):
-    doc = docx.Document(file_path)
-    return "\n".join([p.text for p in doc.paragraphs]).strip()
 def extract_text_from_image(file_path):
     return pytesseract.image_to_string(Image.open(file_path)).strip()
@@ -68,9 +65,7 @@ def classify_dates(text, dates):
     issue_keywords = ["issued on", "dated", "notified on", "circular no"]
     event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]
-    issue_dates = []
-    event_dates = []
     for d in dates:
         idx = text.lower().find(d.lower())
         if idx != -1:
@@ -80,14 +75,9 @@ def classify_dates(text, dates):
             elif any(k in context for k in event_keywords):
                 after_text = text[idx: idx+80]
                 match = re.search(rf"{re.escape(d)}[^\n]*", after_text)
-                if match:
-                    event_dates.append(match.group().strip())
-                else:
-                    event_dates.append(d)
     if not issue_dates and dates:
         issue_dates.append(dates[0])
     return issue_dates, event_dates
 # ------------------------
@@ -96,78 +86,80 @@ def classify_dates(text, dates):
 def verify_text(text, source_type="TEXT"):
     if not text.strip():
         return "--- Evidence Report ---\n\n❌ No readable text provided."
     grammar_issue = check_grammar(text)
     dates = extract_dates(text)
     issue_dates, event_dates = classify_dates(text, dates)
     labels = ["REAL", "FAKE"]
     result = classifier(text[:1000], candidate_labels=labels)
     report = "📄 Evidence Report\n\n"
     report += "🔎 Document Analysis\n\n"
     report += f"Source: {source_type}\n\n"
     report += "✅ Evidence Considered\n\n"
-    if grammar_issue:
-        report += "Minor grammar/spelling issues were detected but do not affect authenticity.\n\n"
-    else:
-        report += "No major grammar or spelling issues detected.\n\n"
     if issue_dates:
         report += f"📌 Document Issue Date(s): {', '.join(issue_dates)}\n"
     if event_dates:
         report += f"📌 Event/Holiday Date(s): {', '.join(event_dates)}\n"
     if not dates:
         report += "No specific dates were clearly detected.\n"
-    report += "\nDocument formatting and official tone resemble genuine university circulars.\n"
-    report += "Signatures and registrar details align with standard official notices.\n\n"
     report += "🏁 Classification Result\n\n"
     report += f"Verdict: {result['labels'][0]}\n"
     report += f"Confidence: {result['scores'][0]:.2f}\n"
     return report
 def verify_document(file):
     file_path = file.name
     ext = file_path.split('.')[-1].lower()
     if ext == "pdf":
         text = extract_text_from_pdf(file_path)
     elif ext == "docx":
         text = extract_text_from_docx(file_path)
     elif ext in ["png", "jpg", "jpeg"]:
         text = extract_text_from_image(file_path)
     else:
-        return "Unsupported file type."
-    return verify_text(text, source_type=ext.upper())
 # ------------------------
-# Streamlit UI
 # ------------------------
-def process_input(file, manual_text):
-    if file is not None:
-        report = verify_document(file)
-        return report
-    elif manual_text.strip():
-        report = verify_text(manual_text, source_type="MANUAL TEXT")
-        return report
-    else:
-        return "❌ Please upload a document or paste text first."
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("## 📑 Document Authenticity Verifier")
-    gr.Markdown("Upload a **PDF, DOCX, or Image**, OR paste raw **text** to verify authenticity.")
-    with gr.Row():
-        file_input = gr.File(label="Upload Document", file_types=[".pdf", ".docx", ".png", ".jpg", ".jpeg"])
-        text_input = gr.Textbox(label="Or paste the notification text here:", lines=10, placeholder="Paste text here...")
-    verify_btn = gr.Button("Verify Document")
-    output_box = gr.Textbox(label="Verification Report", lines=20)
-    verify_btn.click(fn=process_input, inputs=[file_input, text_input], outputs=output_box)
-# launch app
-demo.launch()

 import gradio as gr
 from transformers import pipeline
 import pdfplumber
             if page_text:
                 text += page_text + "\n"
+    if not text.strip():  # OCR fallback
         ocr_text = ""
         doc = fitz.open(file_path)
         for page_num in range(len(doc)):
     return text.strip()
 def extract_text_from_docx(file_path):
+    doc_file = docx.Document(file_path)
+    return "\n".join([p.text for p in doc_file.paragraphs]).strip()
 def extract_text_from_image(file_path):
     return pytesseract.image_to_string(Image.open(file_path)).strip()
     issue_keywords = ["issued on", "dated", "notified on", "circular no"]
     event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]
+    issue_dates, event_dates = [], []
     for d in dates:
         idx = text.lower().find(d.lower())
         if idx != -1:
             elif any(k in context for k in event_keywords):
                 after_text = text[idx: idx+80]
                 match = re.search(rf"{re.escape(d)}[^\n]*", after_text)
+                event_dates.append(match.group().strip() if match else d)
     if not issue_dates and dates:
         issue_dates.append(dates[0])
     return issue_dates, event_dates
 # ------------------------
 def verify_text(text, source_type="TEXT"):
     if not text.strip():
         return "--- Evidence Report ---\n\n❌ No readable text provided."
     grammar_issue = check_grammar(text)
     dates = extract_dates(text)
     issue_dates, event_dates = classify_dates(text, dates)
     labels = ["REAL", "FAKE"]
     result = classifier(text[:1000], candidate_labels=labels)
     report = "📄 Evidence Report\n\n"
     report += "🔎 Document Analysis\n\n"
     report += f"Source: {source_type}\n\n"
     report += "✅ Evidence Considered\n\n"
+    report += ("Minor grammar/spelling issues detected.\n\n" if grammar_issue
+               else "No major grammar or spelling issues detected.\n\n")
     if issue_dates:
         report += f"📌 Document Issue Date(s): {', '.join(issue_dates)}\n"
     if event_dates:
         report += f"📌 Event/Holiday Date(s): {', '.join(event_dates)}\n"
     if not dates:
         report += "No specific dates were clearly detected.\n"
+    report += "\nDocument formatting and tone resemble genuine notices.\n\n"
     report += "🏁 Classification Result\n\n"
     report += f"Verdict: {result['labels'][0]}\n"
     report += f"Confidence: {result['scores'][0]:.2f}\n"
     return report
 def verify_document(file):
+    if file is None:
+        return None, "❌ Please upload a file."
     file_path = file.name
     ext = file_path.split('.')[-1].lower()
     if ext == "pdf":
         text = extract_text_from_pdf(file_path)
+        preview = text[:1000] + ("..." if len(text) > 1000 else "")
     elif ext == "docx":
         text = extract_text_from_docx(file_path)
+        preview = text[:1000] + ("..." if len(text) > 1000 else "")
     elif ext in ["png", "jpg", "jpeg"]:
         text = extract_text_from_image(file_path)
+        preview = Image.open(file_path)  # show image preview
     else:
+        return None, "Unsupported file type."
+    return preview, verify_text(text, source_type=ext.upper())
+def process_text_input(manual_text):
+    if manual_text.strip():
+        return manual_text, verify_text(manual_text, source_type="MANUAL TEXT")
+    return None, "❌ Please paste some text first."
 # ------------------------
+# Gradio UI
 # ------------------------
+with gr.Blocks(theme=gr.themes.Soft(), css="""
+#report-box {background:#f9f9fb; border-radius:10px; padding:15px; box-shadow:0 2px 6px rgba(0,0,0,0.1);}
+#preview-box {background:#eef7ff; border-radius:10px; padding:15px; box-shadow:0 2px 6px rgba(0,0,0,0.1);}
+""") as demo:
     gr.Markdown("## 📑 Document Authenticity Verifier")
+    gr.Markdown("Choose an option below to verify your document:")
+    with gr.Tabs():
+        with gr.Tab("📂 Upload File"):
+            file_input = gr.File(label="Upload Document", file_types=[".pdf", ".docx", ".png", ".jpg", ".jpeg"])
+            preview_box = gr.Component(label="📄 File Preview", elem_id="preview-box")
+            report_box = gr.Textbox(label="Verification Report", lines=20, elem_id="report-box")
+            verify_btn_file = gr.Button("🔍 Verify Document")
+            verify_btn_file.click(fn=verify_document, inputs=file_input, outputs=[preview_box, report_box])
+        with gr.Tab("📝 Paste Text"):
+            text_input = gr.Textbox(label="Paste Notification Text", lines=10, placeholder="Paste text here...")
+            preview_text = gr.Textbox(label="Text Preview", lines=10, elem_id="preview-box")
+            report_box_text = gr.Textbox(label="Verification Report", lines=20, elem_id="report-box")
+            verify_btn_text = gr.Button("🔍 Verify Text")
+            verify_btn_text.click(fn=process_text_input, inputs=text_input, outputs=[preview_text, report_box_text])
+# ------------------------
+# Launch
+# ------------------------
+if __name__ == "__main__":
+    demo.launch()