fizzarif7 commited on
Commit
7e5954b
·
verified ·
1 Parent(s): 4b716ec

Upload news.py

Browse files
Files changed (1) hide show
  1. news.py +163 -0
news.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import pdfplumber
3
+ import docx
4
+ from PIL import Image
5
+ import pytesseract
6
+ from pdf2image import convert_from_path
7
+ from textblob import TextBlob
8
+ import re
9
+ import streamlit as st
10
# ------------------------------
# Initialize Zero-Shot Classifier
# ------------------------------
# Hugging Face zero-shot pipeline, loaded once at import time. Downloads the
# BART-MNLI weights on first run, so the first startup can be slow.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
15
+ # ------------------------------
16
+ # Text Extraction
17
+ # ------------------------------
18
def extract_text_from_pdf(file_path):
    """Extract text from a PDF; fall back to Tesseract OCR for scanned pages.

    Tries pdfplumber's embedded-text layer first. If that yields nothing
    (e.g. a scanned/image-only PDF), renders each page with pdf2image and
    OCRs it. Returns the stripped text ('' if nothing was readable).
    """
    with pdfplumber.open(file_path) as pdf:
        pages = [page.extract_text() for page in pdf.pages]
    text = "\n".join(content for content in pages if content)

    if not text.strip():
        # OCR fallback: no embedded text layer was found.
        rendered = convert_from_path(file_path)
        text = "\n".join(pytesseract.image_to_string(img) for img in rendered)

    return text.strip()
35
def extract_text_from_docx(file_path):
    """Return all paragraph text from a .docx file, newline-joined and stripped."""
    document = docx.Document(file_path)
    lines = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(lines).strip()
39
def extract_text_from_image(file_path):
    """Run Tesseract OCR on an image file and return the recognized text, stripped.

    Fix: the original left the Image.open() file handle open (PIL opens
    lazily and relies on GC to close it); the `with` block releases it
    deterministically.
    """
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image).strip()
42
+ # ------------------------------
43
+ # Grammar & Spelling (TextBlob)
44
+ # ------------------------------
45
def check_grammar(text):
    """Return True when TextBlob's spell-correction would alter *text*.

    Used as a coarse "document contains spelling/grammar issues" signal.
    """
    corrected = str(TextBlob(text).correct())
    return text != corrected
50
+ # ------------------------------
51
+ # Date Extraction (Improved)
52
+ # ------------------------------
53
def extract_dates(text):
    """Find date-like strings in *text* and return them deduplicated.

    Fix: the original returned list(set(...)), whose ordering is
    nondeterministic between runs; dict.fromkeys dedupes while preserving
    the order of first discovery (pattern order, then match position), so
    downstream output (and classify_dates' first-date fallback) is stable.
    """
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',                 # 28-05-2025 / 28/05/2025
        r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b',                     # 28.05.2025
        r'\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s*,?\s*\d{2,4}\b',  # 28th May 2025
        r'\b\w+\s+\d{1,2},\s*\d{4}\b',                        # May 28, 2025
    ]

    dates_found = []
    for pattern in date_patterns:
        dates_found.extend(re.findall(pattern, text, flags=re.IGNORECASE))

    # Ordered dedupe (dict preserves insertion order).
    return list(dict.fromkeys(dates_found))
68
def classify_dates(text, dates):
    """Split *dates* into (issue_dates, event_dates) using nearby keywords.

    For each date, a 120-character window around its first (case-insensitive)
    occurrence is scanned: issue keywords win over event keywords; event
    dates carry the rest of their line as a description. Dates with no
    recognizable context are dropped. If nothing qualified as an issue date,
    the first detected date is assumed to be one.
    """
    issue_markers = ("issued on", "dated", "notified on", "circular no")
    event_markers = ("holiday", "observed on", "exam on", "will be held on", "effective from")

    issue_dates = []
    event_dates = []
    lowered = text.lower()

    for date_str in dates:
        position = lowered.find(date_str.lower())
        if position == -1:
            continue

        window = text[max(0, position - 60): position + 60].lower()

        if any(marker in window for marker in issue_markers):
            issue_dates.append(date_str)
        elif any(marker in window for marker in event_markers):
            # Capture the date plus the remainder of its line as the event name.
            snippet = text[position: position + 80]
            tail = re.search(rf"{re.escape(date_str)}[^\n]*", snippet)
            event_dates.append(tail.group().strip() if tail else date_str)

    # Heuristic fallback: treat the first detected date as the issue date.
    if not issue_dates and dates:
        issue_dates.append(dates[0])

    return issue_dates, event_dates
96
+ # ------------------------------
97
+ # Evidence & Classification
98
+ # ------------------------------
99
def verify_document(file_path):
    """Extract text from a PDF/DOCX/image, analyze it, and return an evidence report.

    Returns a short error string for unsupported types or unreadable
    documents; otherwise a multi-line report covering grammar, detected
    dates, and a zero-shot REAL/FAKE classification.
    """
    # Dispatch on the lowercased extension; a path with no dot falls through
    # to "unsupported" (split('.')[-1] returns the whole path then).
    ext = file_path.split('.')[-1].lower()
    if ext == "pdf":
        text = extract_text_from_pdf(file_path)
    elif ext == "docx":
        text = extract_text_from_docx(file_path)
    elif ext in ("png", "jpg", "jpeg"):
        text = extract_text_from_image(file_path)
    else:
        return "Unsupported file type."

    if not text.strip():
        return "--- Evidence Report ---\n\n❌ No readable text was extracted from the document."

    # Grammar & spelling: True when TextBlob's correction would change the text.
    grammar_issue = check_grammar(text)

    # Dates
    dates = extract_dates(text)
    issue_dates, event_dates = classify_dates(text, dates)

    # Zero-shot classification. Truncated to 1000 chars to stay within the
    # BART model's input limit, so the verdict reflects the document opening.
    labels = ["REAL", "FAKE"]
    result = classifier(text[:1000], candidate_labels=labels)

    # BUG FIX: the original tested `"ocr_text" in locals()`, which is always
    # False in this scope (ocr_text only exists inside extract_text_from_pdf),
    # so the report unconditionally said "No". Images are always OCRed; for
    # PDFs the OCR fallback happens inside the helper and is not surfaced here
    # (would require the helper to report it -- TODO if accuracy matters).
    ocr_applied = ext in ("png", "jpg", "jpeg")

    # NOTE(review): several emoji below appear mojibake-garbled in this copy
    # of the file; they are reproduced verbatim to keep output unchanged.
    report = "πŸ“„ Evidence Report\n\n"
    report += "πŸ”Ž Document Analysis\n\n"
    report += f"File Type: {ext.upper()}\n"
    report += "OCR Applied: " + ("Yes" if ocr_applied else "No") + "\n\n"

    report += "βœ… Evidence Considered\n\n"
    if grammar_issue:
        report += "Minor grammar/spelling issues were detected but do not affect authenticity.\n\n"
    else:
        report += "No major grammar or spelling issues detected.\n\n"

    if issue_dates:
        report += f"πŸ“Œ Document Issue Date(s): {', '.join(issue_dates)}\n"
    if event_dates:
        report += f"πŸ“Œ Event/Holiday Date(s): {', '.join(event_dates)}\n"
    if not dates:
        report += "No specific dates were clearly detected.\n"

    # NOTE(review): the next two lines are unconditional boilerplate -- they
    # are asserted regardless of what the document actually contains.
    report += "\nDocument formatting and official tone resemble genuine university circulars.\n"
    report += "Signatures and registrar details align with standard official notices.\n\n"

    report += "🏁 Classification Result\n\n"
    report += f"Verdict: {result['labels'][0]}\n"
    report += f"Confidence: {result['scores'][0]:.2f}\n"

    return report
154
# ------------------------------
# Streamlit UI
# ------------------------------
st.title("πŸ“„ Document Verifier")
st.write("Upload a PDF, DOCX, or Image to check authenticity.")

uploaded_file = st.file_uploader("Upload file", type=["pdf", "docx", "png", "jpg", "jpeg"])

if uploaded_file is not None:
    # BUG FIX: the original wrote the upload into the working directory under
    # the raw client-supplied filename (collision and path-traversal risk) and
    # never deleted it. Use a temporary file instead, keeping the extension
    # because verify_document dispatches on it.
    import os
    import tempfile

    suffix = os.path.splitext(uploaded_file.name)[1].lower()
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_file.getbuffer())
        tmp_path = tmp.name
    try:
        result = verify_document(tmp_path)
    finally:
        os.remove(tmp_path)  # never leave uploads on disk

    st.text_area("πŸ“‹ Evidence Report", result, height=400)