Update src/streamlit_app.py
Browse files- src/streamlit_app.py +158 -38
src/streamlit_app.py
CHANGED
|
@@ -1,40 +1,160 @@
|
|
| 1 |
-
import altair as alt
|
| 2 |
-
import numpy as np
|
| 3 |
-
import pandas as pd
|
| 4 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
"
|
| 29 |
-
|
| 30 |
-
"
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import re
|
| 5 |
+
import fitz # PyMuPDF
|
| 6 |
+
from langdetect import detect, DetectorFactory
|
| 7 |
+
from googletrans import Translator
|
| 8 |
+
from transformers import pipeline
|
| 9 |
|
| 10 |
+
# for model serialization
|
| 11 |
+
import joblib
|
| 12 |
+
|
| 13 |
+
# for creating a folder
|
| 14 |
+
import os
|
| 15 |
+
|
| 16 |
+
# for hugging face space authentication to upload files
|
| 17 |
+
from huggingface_hub import login, HfApi
|
| 18 |
+
|
| 19 |
+
# Make langdetect deterministic: it samples text features internally, so a
# fixed seed keeps detect() results stable across runs.
DetectorFactory.seed = 0

# -------------------------
# Initialize Translator & Summarizer
# -------------------------
# Note: Initializing models here will load them when the app starts
# (the summarization model is downloaded/loaded into memory at import time).
# Consider caching (e.g. st.cache_resource) or lazy loading for production.
translator = Translator()
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
|
| 27 |
+
|
| 28 |
+
# -------------------------
|
| 29 |
+
# Extract text from PDF
|
| 30 |
+
# -------------------------
|
| 31 |
+
def extract_text_from_pdf(pdf_file):
    """Extract plain text from an uploaded PDF.

    Parameters
    ----------
    pdf_file : any object exposing ``.getvalue() -> bytes``
        (e.g. a Streamlit ``UploadedFile``).

    Returns
    -------
    str
        Concatenated "text" output of every page, stripped of surrounding
        whitespace.
    """
    # Open directly from the in-memory bytes instead of writing a fixed
    # "temp.pdf" to disk: the hard-coded path raced between concurrent
    # Streamlit sessions (one user's upload could clobber another's) and
    # needed try/finally cleanup; the stream form avoids disk I/O entirely.
    parts = []
    with fitz.open(stream=pdf_file.getvalue(), filetype="pdf") as doc:
        for page in doc:
            parts.append(page.get_text("text"))
    return "".join(parts).strip()
|
| 48 |
+
|
| 49 |
+
# Translate text to English using Google Translate
|
| 50 |
+
# -------------------------
|
| 51 |
+
def translate_text_google(text):
    """Translate *text* to English via the module-level Google translator.

    Empty/None input yields "". Long input is sent in bounded chunks and the
    per-chunk translations are joined with single spaces.
    """
    if not text:
        return ""

    # Google Translate handles large text, but bounded requests are safer.
    chunk_size = 5000
    pieces = []
    for start in range(0, len(text), chunk_size):
        segment = text[start:start + chunk_size]
        pieces.append(translator.translate(segment, dest='en').text)
    return " ".join(pieces)
|
| 62 |
+
|
| 63 |
+
# Summarize text safely
|
| 64 |
+
# -------------------------
|
| 65 |
+
def safe_summarize(text, max_length=150, min_length=30):
    """Summarize *text* with the module-level summarizer.

    Falls back to returning the input unchanged when it is empty, shorter
    than 10 words, or when the summarizer raises (a warning is shown).
    """
    # Nothing worth condensing below ~10 words.
    if not text or len(text.split()) < 10:
        return text

    try:
        output = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
        )
    except Exception as exc:
        st.warning(f"⚠️ Summarization failed: {exc}")
        return text
    return output[0]['summary_text']
|
| 74 |
+
|
| 75 |
+
# Extract entities
|
| 76 |
+
# -------------------------
|
| 77 |
+
# Compiled once at import time; IGNORECASE lets lower-case documents match.
_PAN_RE = re.compile(r"\b[A-Z]{5}\d{4}[A-Z]\b", re.IGNORECASE)          # PAN: AAAAA9999A
_ACC_RE = re.compile(r"account\s*number\s*[:\-]?\s*([A-Za-z0-9]+)", re.IGNORECASE)
# Require a leading digit: the old pattern ([\d,]+) could match a bare comma
# (e.g. "penalties, however"), yielding an empty Penalty that falsely
# triggered the penalty action downstream.
_PENALTY_RE = re.compile(r"\bpenalt(?:y|ies)\s*[:\-]?\s*(\d[\d,]*)", re.IGNORECASE)
_DEACTIVATE_RE = re.compile(r"\bdeactivat(?:e|ed|ion)\b", re.IGNORECASE)


def extract_entities(text):
    """Pull key entities out of (already-translated) document text.

    Parameters
    ----------
    text : str

    Returns
    -------
    dict
        Any of the keys "PAN", "Account_Number", "Penalty", "Deactivate"
        that were found; a missing key means the entity was not present.
    """
    entities = {}

    pan_match = _PAN_RE.search(text)
    if pan_match:
        # Normalize to upper case so downstream comparisons are uniform.
        entities["PAN"] = pan_match.group(0).upper()

    acc_match = _ACC_RE.search(text)
    if acc_match:
        entities["Account_Number"] = acc_match.group(1)

    penalty_match = _PENALTY_RE.search(text)
    if penalty_match:
        # Strip thousands separators: "5,000" -> "5000".
        entities["Penalty"] = penalty_match.group(1).replace(",", "")

    deactivate_match = _DEACTIVATE_RE.search(text)
    if deactivate_match:
        entities["Deactivate"] = deactivate_match.group(0).lower()

    return entities
|
| 102 |
+
|
| 103 |
+
# Trigger actions
|
| 104 |
+
# -------------------------
|
| 105 |
+
def trigger_action(entities):
    """Map extracted entities to a human-readable follow-up action string.

    Priority order: penalty > deactivation request > bare account number >
    no action. Missing PAN/account values are rendered as 'N/A'.
    """
    account = entities.get('Account_Number', 'N/A')
    pan = entities.get('PAN', 'N/A')

    if "Penalty" in entities:
        return f"Penalty of {entities['Penalty']} recorded for account {account} (PAN: {pan})"
    if "Deactivate" in entities:
        # Fixed the user-facing message: was the garbled
        # "Kindy Deactivate ... as per request having (PAN: ...)".
        return f"Kindly deactivate account {account} as per request (PAN: {pan})"
    if "Account_Number" in entities:
        return f"Account {entities['Account_Number']} flagged for review."
    return "No action required"
|
| 114 |
+
|
| 115 |
+
# Process single PDF - adapted for Streamlit FileUploader
|
| 116 |
+
# -------------------------
|
| 117 |
+
def process_uploaded_pdf(pdf_file):
    """Run the full pipeline on one uploaded PDF.

    Steps: extract text, detect language, translate to English when needed,
    summarize, extract entities, and derive a suggested action.

    Parameters
    ----------
    pdf_file : Streamlit UploadedFile

    Returns
    -------
    dict
        Display-ready results: file name, detected language, 500-char
        snippets of raw/translated text, summary, entities, and action.
    """
    raw_text = extract_text_from_pdf(pdf_file)

    # langdetect raises (LangDetectException) on empty or featureless input --
    # e.g. scanned/image-only PDFs yield no extractable text -- which
    # previously crashed the whole app. Fall back to "unknown" instead.
    lang = "unknown"
    if raw_text:
        try:
            lang = detect(raw_text)
        except Exception:
            pass

    translated_text = translate_text_google(raw_text) if lang != "en" else raw_text
    summary = safe_summarize(translated_text)
    entities = extract_entities(translated_text)
    action_result = trigger_action(entities)

    def _snippet(s):
        # First 500 chars, with an ellipsis only when truncated.
        return s[:500] + ("..." if len(s) > 500 else "")

    return {
        "file_name": pdf_file.name,
        "detected_language": lang,
        "raw_text_snippet": _snippet(raw_text),
        "translated_text_snippet": _snippet(translated_text),
        "summary": summary,
        "entities": entities,
        "action_triggered": action_result,
    }
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
# --- Streamlit UI ---------------------------------------------------------
st.title("PDF Document Processor")
st.write("Upload a PDF file to extract text, translate (if needed), summarize, identify key entities, and suggest actions.")

pdf_upload = st.file_uploader("Choose a PDF file", type="pdf")

if pdf_upload is not None:
    st.write("Processing PDF...")
    try:
        report = process_uploaded_pdf(pdf_upload)

        st.subheader("Processing Results:")
        # Scalar fields first, rendered in a fixed order.
        for label, field in [
            ("File Name", "file_name"),
            ("Detected Language", "detected_language"),
            ("Raw Text Snippet", "raw_text_snippet"),
            ("Translated Text Snippet", "translated_text_snippet"),
            ("Summary", "summary"),
        ]:
            st.write(f"**{label}:** {report[field]}")

        # Entities as a bulleted list, then the suggested action.
        st.write("**Extracted Entities:**")
        for entity_name, entity_value in report['entities'].items():
            st.write(f"- {entity_name}: {entity_value}")
        st.write(f"**Action Triggered:** {report['action_triggered']}")

    except Exception as err:
        st.error(f"An error occurred during processing: {err}")
|