ink85 committed on
Commit
04b6556
·
verified ·
1 Parent(s): 57cd347

🚀 Initial upload of Streamlit app

Browse files
Files changed (3) hide show
  1. Dockerfile +9 -13
  2. app.py +160 -0
  3. requirements.txt +7 -3
Dockerfile CHANGED
@@ -1,20 +1,16 @@
1
- FROM python:3.13.5-slim
 
2
 
 
3
  WORKDIR /app
4
 
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- git \
9
- && rm -rf /var/lib/apt/lists/*
10
-
11
- COPY requirements.txt ./
12
- COPY src/ ./src/
13
 
 
14
  RUN pip3 install -r requirements.txt
15
 
16
- EXPOSE 8501
17
-
18
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
 
20
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
1
# Use a minimal base image with Python 3.9 installed
FROM python:3.9-slim

# Set the working directory inside the container to /app
WORKDIR /app

# Copy all files from the current directory on the host to the container's /app directory
# NOTE(review): this copies everything (including any local junk) into the image;
# a .dockerignore would keep the image smaller — confirm one exists in the repo.
COPY . .

# Install Python dependencies listed in requirements.txt
# NOTE(review): consider `pip3 install --no-cache-dir` to shrink the image layer.
RUN pip3 install -r requirements.txt

# Define the command to run the Streamlit app on port 8501 and make it accessible externally
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.enableXsrfProtection=false"]

# NOTE: Disable XSRF protection for easier external access in order to make batch predictions
# NOTE(review): disabling XSRF protection weakens CSRF defenses; acceptable only if
# the Space is not handling sensitive state — confirm this trade-off is intended.
app.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import json
4
+ import re
5
+ import fitz # PyMuPDF
6
+ from langdetect import detect, DetectorFactory
7
+ from googletrans import Translator
8
+ from transformers import pipeline
9
+
10
+ # for model serialization
11
+ import joblib
12
+
13
+ # for creating a folder
14
+ import os
15
+
16
+ # for hugging face space authentication to upload files
17
+ from huggingface_hub import login, HfApi
18
+
19
# Seed langdetect's RNG so language detection is deterministic across runs.
DetectorFactory.seed = 0

# Initialize Translator & Summarizer
# -------------------------
# Note: Initializing models here will load them when the app starts.
# Consider caching or lazy loading for performance in production.
# (e.g. @st.cache_resource would avoid reloading on every Streamlit rerun.)
translator = Translator()
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
27
+
28
+ # -------------------------
29
+ # Extract text from PDF
30
+ # -------------------------
31
def extract_text_from_pdf(pdf_file):
    """Return all extractable text from an uploaded PDF.

    Args:
        pdf_file: Uploaded-file-like object exposing ``getvalue()`` that
            returns the raw PDF bytes (Streamlit ``UploadedFile`` —
            TODO confirm against caller).

    Returns:
        str: Concatenated text of every page, stripped of leading/trailing
        whitespace. Empty string when the PDF has no extractable text.
    """
    # Open the PDF directly from the in-memory bytes instead of writing a
    # shared "temp.pdf" on disk: the fixed filename raced between concurrent
    # Streamlit sessions and required manual cleanup on every call.
    pages = []
    with fitz.open(stream=pdf_file.getvalue(), filetype="pdf") as doc:
        for page in doc:
            pages.append(page.get_text("text"))
    return "".join(pages).strip()
48
+
49
+ # Translate text to English using Google Translate
50
+ # -------------------------
51
def translate_text_google(text):
    """Translate *text* into English via the module-level Google translator.

    Long inputs are split into 5000-character chunks before translation and
    the translated pieces are rejoined with single spaces. Falsy input
    yields "".
    """
    if not text:
        return ""

    chunk_size = 5000  # Google Translate handles large text but splitting is safer
    pieces = []
    start = 0
    while start < len(text):
        segment = text[start:start + chunk_size]
        pieces.append(translator.translate(segment, dest='en').text)
        start += chunk_size
    return " ".join(pieces)
62
+
63
+ # Summarize text safely
64
+ # -------------------------
65
def safe_summarize(text, max_length=150, min_length=30):
    """Summarize *text* with the module-level transformers pipeline.

    Inputs that are empty or shorter than 10 words are returned unchanged.
    If summarization raises, a Streamlit warning is shown and the original
    text is returned (best-effort behavior).
    """
    word_count = len(text.split()) if text else 0
    if word_count < 10:
        # Too short (or empty) to be worth summarizing.
        return text
    try:
        result = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
        )
    except Exception as e:
        st.warning(f"⚠️ Summarization failed: {e}")
        return text
    return result[0]['summary_text']
74
+
75
+ # Extract entities
76
+ # -------------------------
77
def extract_entities(text):
    """Extract key entities from *text* using regular expressions.

    Args:
        text (str): Document text to scan (expected to be English).

    Returns:
        dict: Any of the keys "PAN", "Account_Number", "Penalty",
        "Deactivate" that were found. "PAN" is reported uppercase;
        "Penalty" has thousands separators removed.
    """
    entities = {}

    # PAN format: AAAAA9999A (matched case-insensitively, reported uppercase)
    pan_match = re.search(r"\b[A-Z]{5}\d{4}[A-Z]\b", text, re.IGNORECASE)

    # Account Number: alphanumeric token following "account number"
    acc_match = re.search(r"account\s*number\s*[:\-]?\s*([A-Za-z0-9]+)", text, re.IGNORECASE)

    # Penalty (accepts 'penalty' or 'penalties').
    # The amount must start with a digit — the old pattern ([\d,]+) could
    # match a bare comma (e.g. "the penalty, if any") and record an empty
    # penalty. An optional decimal part is now captured so "5,000.25" is
    # no longer truncated to "5,000".
    penalty_match = re.search(
        r"\bpenalt(?:y|ies)\s*[:\-]?\s*(\d[\d,]*(?:\.\d+)?)", text, re.IGNORECASE
    )

    # Deactivation keywords
    deactivate_match = re.search(r"\bdeactivat(?:e|ed|ion)\b", text, re.IGNORECASE)

    if pan_match:
        entities["PAN"] = pan_match.group(0).upper()
    if acc_match:
        entities["Account_Number"] = acc_match.group(1)
    if penalty_match:
        entities["Penalty"] = penalty_match.group(1).replace(",", "")
    if deactivate_match:
        entities["Deactivate"] = deactivate_match.group(0).lower()

    return entities
102
+
103
+ # Trigger actions
104
+ # -------------------------
105
def trigger_action(entities):
    """Map extracted entities to a human-readable action message.

    Priority order: penalty > deactivation > account review > no action.

    Args:
        entities (dict): Output of ``extract_entities`` — may contain
            "Penalty", "Deactivate", "Account_Number", "PAN".

    Returns:
        str: Action message for the highest-priority entity present.
    """
    if "Penalty" in entities:
        return f"Penalty of {entities['Penalty']} recorded for account {entities.get('Account_Number', 'N/A')} (PAN: {entities.get('PAN', 'N/A')})"
    elif "Deactivate" in entities:
        # Fixed user-facing typo: "Kindy" -> "Kindly".
        return f"Kindly Deactivate {entities.get('Account_Number', 'N/A')} as per request having (PAN: {entities.get('PAN', 'N/A')})"
    elif "Account_Number" in entities:
        return f"Account {entities['Account_Number']} flagged for review."
    else:
        return "No action required"
114
+
115
+ # Process single PDF - adapted for Streamlit FileUploader
116
+ # -------------------------
117
def process_uploaded_pdf(pdf_file):
    """Run the full pipeline on a single uploaded PDF.

    Extracts text, detects the language, translates to English when needed,
    summarizes, extracts entities, and decides an action.

    Args:
        pdf_file: Streamlit UploadedFile for a PDF.

    Returns:
        dict: file name, detected language, 500-char raw/translated text
        snippets, summary, extracted entities, and the triggered action.
    """
    def _snippet(s, limit=500):
        # Truncate long text for display, marking the cut with "...".
        return s[:limit] + ("..." if len(s) > limit else "")

    raw_text = extract_text_from_pdf(pdf_file)

    # langdetect raises LangDetectException on empty/whitespace-only input,
    # so guard against scanned or image-only PDFs that yield no text.
    lang = detect(raw_text) if raw_text else "unknown"
    translated_text = translate_text_google(raw_text) if lang != "en" else raw_text
    summary = safe_summarize(translated_text)
    entities = extract_entities(translated_text)
    action_result = trigger_action(entities)

    return {
        "file_name": pdf_file.name,
        "detected_language": lang,
        "raw_text_snippet": _snippet(raw_text),
        "translated_text_snippet": _snippet(translated_text),
        "summary": summary,
        "entities": entities,
        "action_triggered": action_result,
    }
136
+
137
+
138
# ---- Streamlit page: upload a PDF and display the pipeline results ----
st.title("PDF Document Processor")
st.write("Upload a PDF file to extract text, translate (if needed), summarize, identify key entities, and suggest actions.")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    st.write("Processing PDF...")
    try:
        processed_data = process_uploaded_pdf(uploaded_file)

        st.subheader("Processing Results:")
        # Render the scalar result fields in a fixed order with bold labels.
        for label, field in [
            ("File Name", "file_name"),
            ("Detected Language", "detected_language"),
            ("Raw Text Snippet", "raw_text_snippet"),
            ("Translated Text Snippet", "translated_text_snippet"),
            ("Summary", "summary"),
        ]:
            st.write(f"**{label}:** {processed_data[field]}")
        st.write("**Extracted Entities:**")
        for key, value in processed_data['entities'].items():
            st.write(f"- {key}: {value}")
        st.write(f"**Action Triggered:** {processed_data['action_triggered']}")

    except Exception as e:
        st.error(f"An error occurred during processing: {e}")
requirements.txt CHANGED
@@ -1,3 +1,7 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
1
+ streamlit==1.43.2
2
+ PyMuPDF==1.26.4
3
+ langdetect==1.0.9
4
+ googletrans==4.0.0-rc1
5
+ transformers==4.46.3
6
+ joblib==1.4.2
7
+ torch==2.8.0