Spaces:

fizzarif7
/

DocumentVerifier

Sleeping

App Files Files Community

DocumentVerifier / app.py

fizzarif7

Update app.py

0056d37 verified 5 months ago

raw

history blame contribute delete

5.81 kB

	import os
	import re
	import tempfile

	import docx
	import pdfplumber
	import pytesseract
	import streamlit as st
	from pdf2image import convert_from_path
	from PIL import Image
	from textblob import TextBlob
	from transformers import pipeline

	# ------------------------------
	# Initialize Zero-Shot Classifier
	# ------------------------------
	@st.cache_resource
	def _load_classifier():
	    """Build the zero-shot classification pipeline once per server process."""
	    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


	# Streamlit re-executes this script on every widget interaction; going through
	# the cached loader avoids re-creating the model on each rerun.
	classifier = _load_classifier()

	# ------------------------------
	# Text Extraction
	# ------------------------------
	def extract_text_from_pdf(file_path):
	    """Return the text of the PDF at ``file_path``, using OCR as a fallback.

	    The text layer of every page is read with pdfplumber. When that yields
	    nothing but whitespace, the pages are rendered to images with
	    ``convert_from_path`` and passed through pytesseract instead.

	    Args:
	        file_path: Path of the PDF file to read.

	    Returns:
	        The extracted text with leading/trailing whitespace removed.
	    """
	    with pdfplumber.open(file_path) as pdf:
	        page_texts = [page.extract_text() for page in pdf.pages]

	    # Pages without a text layer give a falsy result and are skipped.
	    embedded = "\n".join(chunk for chunk in page_texts if chunk).strip()
	    if embedded:
	        return embedded

	    # OCR fallback
	    rendered_pages = convert_from_path(file_path)
	    recognised = "\n".join(
	        pytesseract.image_to_string(page_image) for page_image in rendered_pages
	    )
	    return recognised.strip()

	def extract_text_from_docx(file_path):
	    """Return the paragraph text of the DOCX at ``file_path``.

	    Paragraphs are joined with newlines and the result is stripped of
	    leading/trailing whitespace.
	    """
	    document = docx.Document(file_path)
	    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
	    joined = "\n".join(paragraph_texts)
	    return joined.strip()

	def extract_text_from_image(file_path):
	    """Return the OCR text of the image at ``file_path``.

	    Args:
	        file_path: Path of the image file to read.

	    Returns:
	        The text recognised by pytesseract, stripped of surrounding whitespace.
	    """
	    # Open the image in a context manager so its file handle is released
	    # as soon as OCR has finished.
	    with Image.open(file_path) as image:
	        return pytesseract.image_to_string(image).strip()

	# ------------------------------
	# Grammar & Spelling (TextBlob)
	# ------------------------------
	def check_grammar(text):
	    """Tell whether TextBlob's ``correct()`` would change ``text``.

	    Returns:
	        True when the corrected text differs from the input, False otherwise.
	    """
	    suggestion = TextBlob(text).correct()
	    return str(suggestion) != text

	# ------------------------------
	# Date Extraction (Improved)
	# ------------------------------
	def extract_dates(text):
	    """Find date-like substrings in ``text``.

	    Recognised shapes: ``28-05-2025`` / ``28/05/2025``, ``28.05.2025``,
	    ``28th May 2025`` (ordinal suffix and comma optional) and ``May 28, 2025``.
	    Matching is case-insensitive.

	    Args:
	        text: The text to scan.

	    Returns:
	        A list of the distinct matched strings, in first-seen order (pattern by
	        pattern, then left to right), so the result is deterministic.
	    """
	    date_patterns = [
	        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',  # 28-05-2025 / 28/05/2025
	        r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b',  # 28.05.2025
	        # 28th May 2025 / 28 May, 2025. The ordinal suffixes are a real
	        # alternation (unescaped "|"), the comma and the whitespace around it
	        # are optional, and the month must be letters only ([^\W\d_]) so that
	        # plain number runs such as "12 345" are not picked up.
	        r'\b\d{1,2}(?:st|nd|rd|th)?\s+[^\W\d_]+\s*,?\s*\d{2,4}\b',
	        r'\b\w+\s+\d{1,2},\s*\d{4}\b',  # May 28, 2025
	    ]

	    dates_found = []
	    for pattern in date_patterns:
	        dates_found.extend(re.findall(pattern, text, flags=re.IGNORECASE))

	    # dict.fromkeys removes duplicates while keeping insertion order; a set
	    # would make the order (and any "first date" fallback) unpredictable.
	    return list(dict.fromkeys(dates_found))

	def classify_dates(text, dates):
	    """Split ``dates`` into issue dates and event dates using nearby keywords.

	    For each date, the first case-insensitive occurrence in ``text`` is located
	    and a window of 60 characters on either side is searched for keywords.
	    Issue keywords take precedence over event keywords. Event dates are
	    returned together with the rest of their line (within 80 characters of the
	    date start) so the event name is kept. Dates with no keyword nearby are
	    not classified.

	    Args:
	        text: The full document text.
	        dates: Date strings to classify, e.g. the result of ``extract_dates``.

	    Returns:
	        A tuple ``(issue_dates, event_dates)`` of lists of strings. When no
	        issue date was recognised but ``dates`` is non-empty, ``dates[0]`` is
	        used as the issue date.
	    """
	    issue_keywords = ["issued on", "dated", "notified on", "circular no"]
	    event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]

	    issue_dates = []
	    event_dates = []

	    for d in dates:
	        date_regex = re.escape(d)
	        # Search the original text case-insensitively instead of searching a
	        # lowered copy: lowering can change the string length for some
	        # characters, which would shift every index taken from it.
	        found = re.search(date_regex, text, flags=re.IGNORECASE)
	        if found is None:
	            continue
	        idx = found.start()
	        context = text[max(0, idx - 60): idx + 60].lower()

	        if any(k in context for k in issue_keywords):
	            issue_dates.append(d)
	        elif any(k in context for k in event_keywords):
	            # Try to capture event/holiday name next to date
	            tail = re.match(
	                rf"{date_regex}[^\n]*", text[idx: idx + 80], flags=re.IGNORECASE
	            )
	            event_dates.append(tail.group().strip() if tail else d)

	    # Fall back to the first date when nothing looked like an issue date.
	    if not issue_dates and dates:
	        issue_dates.append(dates[0])

	    return issue_dates, event_dates

	# ------------------------------
	# Verification Core
	# ------------------------------
	def verify_text(text, source_type="TEXT"):
	    """Build the plain-text evidence report for ``text``.

	    Runs ``check_grammar``, ``extract_dates`` / ``classify_dates`` and the
	    zero-shot classifier (labels REAL / FAKE, first 1000 characters of the
	    text), then formats the findings.

	    Args:
	        text: Document text to analyse.
	        source_type: Label printed on the report's "Source:" line.

	    Returns:
	        The report string. When ``text`` is blank, a short report stating that
	        no readable text was provided.
	    """
	    if not text.strip():
	        return "--- Evidence Report ---\n\n❌ No readable text provided."

	    # Gather all findings first, then assemble the report in one pass.
	    has_language_issues = check_grammar(text)
	    dates = extract_dates(text)
	    issue_dates, event_dates = classify_dates(text, dates)
	    outcome = classifier(text[:1000], candidate_labels=["REAL", "FAKE"])

	    if has_language_issues:
	        language_note = "Minor grammar/spelling issues were detected but do not affect authenticity.\n\n"
	    else:
	        language_note = "No major grammar or spelling issues detected.\n\n"

	    sections = [
	        "📄 Evidence Report\n\n",
	        "🔎 Document Analysis\n\n",
	        f"Source: {source_type}\n\n",
	        "✅ Evidence Considered\n\n",
	        language_note,
	    ]

	    if issue_dates:
	        sections.append(f"📌 Document Issue Date(s): {', '.join(issue_dates)}\n")
	    if event_dates:
	        sections.append(f"📌 Event/Holiday Date(s): {', '.join(event_dates)}\n")
	    if not dates:
	        sections.append("No specific dates were clearly detected.\n")

	    # The first entry of labels/scores is reported as the verdict.
	    top_label = outcome["labels"][0]
	    top_score = outcome["scores"][0]
	    sections.extend([
	        "\nDocument formatting and official tone resemble genuine university circulars.\n",
	        "Signatures and registrar details align with standard official notices.\n\n",
	        "🏁 Classification Result\n\n",
	        f"Verdict: {top_label}\n",
	        f"Confidence: {top_score:.2f}\n",
	    ])

	    return "".join(sections)

	def verify_document(file_path):
	    """Extract the text of a document file and return its evidence report.

	    The extractor is chosen from the file extension (case-insensitive):
	    ``pdf``, ``docx``, or one of ``png`` / ``jpg`` / ``jpeg``.

	    Args:
	        file_path: Path of the file to verify.

	    Returns:
	        The report produced by ``verify_text``, or the string
	        "Unsupported file type." when the extension is missing or unknown.
	    """
	    # splitext only reports a real extension, so an extension-less file that
	    # happens to be called "pdf" or "docx" is not mistaken for that type.
	    ext = os.path.splitext(file_path)[1][1:].lower()
	    if ext not in ("pdf", "docx", "png", "jpg", "jpeg"):
	        return "Unsupported file type."

	    if ext == "pdf":
	        text = extract_text_from_pdf(file_path)
	    elif ext == "docx":
	        text = extract_text_from_docx(file_path)
	    else:
	        text = extract_text_from_image(file_path)

	    return verify_text(text, source_type=ext.upper())

	# ------------------------------
	# Streamlit UI
	# ------------------------------
	st.title("📄 Document Verifier")
	st.write("Upload a PDF, DOCX, Image, or paste text to check authenticity.")

	# File Upload
	uploaded_file = st.file_uploader("Upload file", type=["pdf", "docx", "png", "jpg", "jpeg"])

	# Text Input
	pasted_text = st.text_area("Or paste text below:", height=200)

	# Verify File
	if uploaded_file is not None:
	    # Store the upload in a private temporary file rather than under its
	    # client-supplied name in the working directory: that name is untrusted,
	    # can collide with other uploads, and the file was never cleaned up.
	    # Only the extension is kept, because verify_document dispatches on it.
	    suffix = os.path.splitext(uploaded_file.name)[1]
	    tmp_path = None
	    try:
	        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
	            tmp_path = tmp.name
	            tmp.write(uploaded_file.getbuffer())
	        # The file is closed before it is read again by the extractors.
	        result = verify_document(tmp_path)
	    finally:
	        if tmp_path is not None:
	            os.remove(tmp_path)
	    st.text_area("📋 Evidence Report", result, height=400)

	# Verify Text
	elif pasted_text.strip():
	    result = verify_text(pasted_text, source_type="PASTED TEXT")
	    st.text_area("📋 Evidence Report", result, height=400)