ink85 committed on
Commit
cdfaada
·
verified ·
1 Parent(s): 92e16b2

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +158 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,160 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import os
3
+ import json
4
+ import re
5
+ import fitz # PyMuPDF
6
+ from langdetect import detect, DetectorFactory
7
+ from googletrans import Translator
8
+ from transformers import pipeline
9
 
10
+ # for model serialization
11
+ import joblib
12
+
13
+ # for creating a folder
14
+ import os
15
+
16
+ # for hugging face space authentication to upload files
17
+ from huggingface_hub import login, HfApi
18
+
19
+ DetectorFactory.seed = 0
20
+
21
# Initialize Translator & Summarizer
# -------------------------
# Streamlit re-executes this script from the top on every user interaction.
# Building the summarization pipeline is expensive, so it is created through
# st.cache_resource, which hands back the cached object on later reruns
# instead of constructing it again.
translator = Translator()


@st.cache_resource
def _load_summarizer():
    """Build the summarization pipeline used by the app."""
    return pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


summarizer = _load_summarizer()
27
+
28
+ # -------------------------
29
+ # Extract text from PDF
30
+ # -------------------------
31
def extract_text_from_pdf(pdf_file):
    """Return the plain text of every page of an uploaded PDF.

    The document is opened straight from the upload's in-memory bytes, so no
    temporary file is written. A fixed on-disk name such as ``temp.pdf``
    would be shared by every concurrent session, letting one upload
    overwrite or delete another's file mid-read.

    Args:
        pdf_file: Uploaded file object whose ``getvalue()`` returns the raw
            PDF bytes.

    Returns:
        The text of all pages concatenated in page order, with leading and
        trailing whitespace stripped.
    """
    with fitz.open(stream=pdf_file.getvalue(), filetype="pdf") as doc:
        # join() avoids repeated string reallocation on long documents.
        return "".join(page.get_text("text") for page in doc).strip()
48
+
49
+ # Translate text to English using Google Translate
50
+ # -------------------------
51
def _split_into_chunks(text, max_chunk):
    """Split text into pieces of at most max_chunk characters, cutting at whitespace when possible."""
    chunks = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + max_chunk, total)
        if end < total:
            # Back up to the last whitespace inside the window so a word is
            # never cut in half. If the window holds no whitespace at all
            # (e.g. unspaced scripts), fall back to a hard cut.
            window = text[start:end]
            cut = max(window.rfind(" "), window.rfind("\n"), window.rfind("\t"))
            if cut > 0:
                end = start + cut + 1
        chunks.append(text[start:end])
        start = end
    return chunks


def translate_text_google(text, max_chunk=5000):
    """Translate text to English with the module-level Google translator.

    The text is sent in chunks of at most ``max_chunk`` characters. Chunks
    are cut at whitespace where possible: slicing at a fixed width can split
    a word, and the two halves would then be translated separately and
    re-joined with a space in the middle.

    Args:
        text: Text to translate. Empty or ``None`` yields ``""``.
        max_chunk: Maximum number of characters per translation request.

    Returns:
        The translated chunks joined by single spaces.

    Raises:
        ValueError: If ``max_chunk`` is smaller than 1.
    """
    if not text:
        return ""
    if max_chunk < 1:
        raise ValueError("max_chunk must be a positive integer")

    # NOTE(review): some googletrans releases expose translate() as a
    # coroutine; this call assumes the synchronous API - confirm the pinned
    # version.
    translations = [
        translator.translate(chunk, dest="en").text
        for chunk in _split_into_chunks(text, max_chunk)
        if chunk.strip()  # nothing to translate in a whitespace-only chunk
    ]
    return " ".join(translations)
62
+
63
+ # Summarize text safely
64
+ # -------------------------
65
def safe_summarize(text, max_length=150, min_length=30):
    """Summarize text, falling back to the input when summarization is not possible.

    Args:
        text: Text to summarize.
        max_length: Upper bound passed to the summarizer for the summary length.
        min_length: Lower bound passed to the summarizer for the summary length.

    Returns:
        The generated summary. The input is returned unchanged when it is
        empty, has fewer than 10 whitespace-separated words, or when the
        summarizer raises (a Streamlit warning is shown in that case).
    """
    if not text or len(text.split()) < 10:
        return text  # too short to summarize

    try:
        # truncation=True clips inputs longer than the model accepts; without
        # it, long documents make the pipeline fail and are never summarized.
        result = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
        return result[0]["summary_text"]
    except Exception as e:
        # Deliberate best-effort: report the failure and show the full text.
        st.warning(f"⚠️ Summarization failed: {e}")
        return text
74
+
75
+ # Extract entities
76
+ # -------------------------
77
# PAN format: AAAAA9999A (matched case-insensitively, reported upper-cased).
_PAN_RE = re.compile(r"\b[A-Z]{5}\d{4}[A-Z]\b", re.IGNORECASE)
# "account number", optional ':' or '-', then an alphanumeric identifier.
_ACCOUNT_RE = re.compile(
    r"account\s*number\s*[:\-]?\s*([A-Za-z0-9]+)", re.IGNORECASE
)
# 'penalty' or 'penalties' followed by an amount. The amount must start with
# a digit so that ordinary punctuation ("penalty, if any") is not captured
# as an empty amount.
_PENALTY_RE = re.compile(
    r"\bpenalt(?:y|ies)\s*[:\-]?\s*(\d[\d,]*)", re.IGNORECASE
)
# Deactivation keywords.
_DEACTIVATE_RE = re.compile(r"\bdeactivat(?:e|ed|ion)\b", re.IGNORECASE)


def extract_entities(text):
    """Extract PAN, account number, penalty amount and deactivation keyword from text.

    Only the first occurrence of each entity is reported.

    Args:
        text: Text to scan.

    Returns:
        A dict containing any of the keys ``"PAN"`` (upper-cased),
        ``"Account_Number"``, ``"Penalty"`` (digits only, thousands
        separators removed) and ``"Deactivate"`` (lower-cased keyword).
        Keys for entities that were not found are absent.
    """
    entities = {}

    pan_match = _PAN_RE.search(text)
    if pan_match:
        entities["PAN"] = pan_match.group(0).upper()

    acc_match = _ACCOUNT_RE.search(text)
    if acc_match:
        entities["Account_Number"] = acc_match.group(1)

    penalty_match = _PENALTY_RE.search(text)
    if penalty_match:
        entities["Penalty"] = penalty_match.group(1).replace(",", "")

    deactivate_match = _DEACTIVATE_RE.search(text)
    if deactivate_match:
        entities["Deactivate"] = deactivate_match.group(0).lower()

    return entities
102
+
103
+ # Trigger actions
104
+ # -------------------------
105
def trigger_action(entities):
    """Return the action message for a set of extracted entities.

    Precedence is penalty, then deactivation, then a bare account number;
    missing account number or PAN values are shown as ``N/A``.

    Args:
        entities: Dict of extracted entities, with optional keys
            ``"Penalty"``, ``"Deactivate"``, ``"Account_Number"`` and
            ``"PAN"``.

    Returns:
        A human-readable description of the action, or
        ``"No action required"`` when no relevant entity is present.
    """
    account = entities.get("Account_Number", "N/A")
    pan = entities.get("PAN", "N/A")

    if "Penalty" in entities:
        return f"Penalty of {entities['Penalty']} recorded for account {account} (PAN: {pan})"
    if "Deactivate" in entities:
        return f"Kindly Deactivate {account} as per request having (PAN: {pan})"
    if "Account_Number" in entities:
        return f"Account {account} flagged for review."
    return "No action required"
114
+
115
+ # Process single PDF - adapted for Streamlit FileUploader
116
+ # -------------------------
117
def _snippet(text, limit=500):
    """Return the first limit characters of text, adding '...' when it was cut."""
    return text[:limit] + ("..." if len(text) > limit else "")


def process_uploaded_pdf(pdf_file):
    """Run the full pipeline on one uploaded PDF.

    Steps: extract text, detect its language, translate to English when the
    detected language is not English, summarize, extract entities and derive
    the action message.

    Args:
        pdf_file: Uploaded file object; its ``name`` attribute is reported
            in the result and the object is passed to
            ``extract_text_from_pdf``.

    Returns:
        A dict with the keys ``file_name``, ``detected_language``,
        ``raw_text_snippet``, ``translated_text_snippet``, ``summary``,
        ``entities`` and ``action_triggered``. ``detected_language`` is
        ``"unknown"`` when the language could not be detected.
    """
    # Function-scope import: comes from the langdetect package the file
    # already depends on.
    from langdetect.lang_detect_exception import LangDetectException

    raw_text = extract_text_from_pdf(pdf_file)

    try:
        lang = detect(raw_text)
    except LangDetectException:
        # Raised for empty or feature-less text (e.g. a PDF with no text
        # layer); report it instead of aborting the whole run.
        lang = "unknown"

    if lang in ("en", "unknown"):
        translated_text = raw_text
    else:
        translated_text = translate_text_google(raw_text)

    entities = extract_entities(translated_text)

    return {
        "file_name": pdf_file.name,
        "detected_language": lang,
        "raw_text_snippet": _snippet(raw_text),
        "translated_text_snippet": _snippet(translated_text),
        "summary": safe_summarize(translated_text),
        "entities": entities,
        "action_triggered": trigger_action(entities),
    }
136
+
137
+
138
# -------------------------
# Streamlit page: upload a PDF and show the processing results
# -------------------------
st.title("PDF Document Processor")
st.write("Upload a PDF file to extract text, translate (if needed), summarize, identify key entities, and suggest actions.")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    try:
        # The spinner disappears once processing ends, so the page is not
        # left showing a stale "Processing PDF..." line.
        with st.spinner("Processing PDF..."):
            processed_data = process_uploaded_pdf(uploaded_file)
    except Exception as e:
        # Top-level boundary: report any pipeline failure to the user.
        st.error(f"An error occurred during processing: {e}")
    else:
        st.subheader("Processing Results:")
        st.write(f"**File Name:** {processed_data['file_name']}")
        st.write(f"**Detected Language:** {processed_data['detected_language']}")
        st.write(f"**Raw Text Snippet:** {processed_data['raw_text_snippet']}")
        st.write(f"**Translated Text Snippet:** {processed_data['translated_text_snippet']}")
        st.write(f"**Summary:** {processed_data['summary']}")
        st.write("**Extracted Entities:**")
        if processed_data["entities"]:
            for key, value in processed_data["entities"].items():
                st.write(f"- {key}: {value}")
        else:
            # Avoid rendering a heading with nothing under it.
            st.write("- None found")
        st.write(f"**Action Triggered:** {processed_data['action_triggered']}")