Sripriya16 committed on
Commit
c3a6f7a
·
verified ·
1 Parent(s): bde65a7

Upload 3 files

Browse files
Files changed (3) hide show
  1. Malayalam-en.txt +7 -0
  2. app.py +221 -0
  3. requirements.txt +11 -0
Malayalam-en.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Bilingual Test Document
2
+
3
+ This is the first line in English.
4
+ ഇതൊരു പരീക്ഷണ രേഖയാണ്.
5
+ This line should remain in English.
6
+ ഇതിൽ ഇംഗ്ലീഷും മലയാളവും അടങ്ങിയിരിക്കുന്നു.
7
+ This is the final English line.
app.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ import os
4
+ import fitz # PyMuPDF
5
+ import fasttext
6
+ from huggingface_hub import hf_hub_download
7
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
8
+ from IndicTransToolkit.processor import IndicProcessor
9
+ import torch
10
+ from PIL import Image
11
+ import requests
12
+ import json
13
+ import gradio as gr
14
+
15
# --- CONFIG & SECRET MANAGEMENT ---
# The Gemini key is read from the process environment (on a Hugging Face Space
# this is where Space Secrets are exposed); it is None when the variable is unset.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

TRANSLATION_MODEL_REPO_ID = "ai4bharat/indictrans2-indic-en-1B"
OCR_MODEL_ID = "microsoft/trocr-base-printed"
LANGUAGE_TO_TRANSLATE = "mal"

if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Running on device: {DEVICE}")

# --- GLOBAL MODEL LOADING ---
# Everything below runs once at import time so that individual requests
# reuse the already-loaded models instead of reloading them.

print("Loading fastText language detector (this can take a moment)...")
ft_model_path = hf_hub_download(
    repo_id="facebook/fasttext-language-identification",
    filename="model.bin",
)
lang_detect_model = fasttext.load_model(ft_model_path)
print("✅ fastText loaded.")

print("Loading OCR model...")
# device=-1 keeps the OCR pipeline on the CPU regardless of DEVICE, which can
# be more stable in shared environments.
ocr_pipeline = pipeline("image-to-text", model=OCR_MODEL_ID, device=-1)
print("✅ OCR loaded.")

print(f"Loading tokenizer & model: {TRANSLATION_MODEL_REPO_ID} ...")
tokenizer = AutoTokenizer.from_pretrained(
    TRANSLATION_MODEL_REPO_ID,
    trust_remote_code=True,
)
# Half precision only on GPU; full precision on CPU.
_translation_dtype = torch.float16 if DEVICE == "cuda" else torch.float32
translation_model = AutoModelForSeq2SeqLM.from_pretrained(
    TRANSLATION_MODEL_REPO_ID,
    trust_remote_code=True,
    torch_dtype=_translation_dtype,
).to(DEVICE)
print("✅ Translation model loaded.")

# Pre/post-processor used around the translation model.
ip = IndicProcessor(inference=True)
print("✅ IndicProcessor initialized.")
54
+
55
+
56
+ # --- UTILITY FUNCTIONS ---
57
+
58
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, one page after another.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of each page followed by a newline, concatenated in page
        order, or None if the file could not be opened or read (the error
        is printed).
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract, which the previous explicit close() call did not guarantee.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text("text") + "\n" for page in doc)
    except Exception as e:
        # Best effort by design: the caller treats None as "nothing extracted".
        print(f"PDF extract error: {e}")
        return None
69
+
70
def read_text_from_txt(path):
    """Read a UTF-8 text file and return its full contents.

    Args:
        path: Path of the text file.

    Returns:
        The file contents as a string, or None if reading failed (the error
        is printed).
    """
    try:
        with open(path, mode="r", encoding="utf-8") as handle:
            contents = handle.read()
    except Exception as err:
        print(f"TXT read error: {err}")
        return None
    return contents
77
+
78
def extract_text_from_image(path):
    """Run the module-level OCR pipeline on an image file.

    Args:
        path: Path of the image, opened with PIL.

    Returns:
        The "generated_text" of the first OCR result, an empty string when
        the pipeline returns nothing, or None if opening or OCR failed (the
        error is printed).
    """
    try:
        with Image.open(path) as picture:
            results = ocr_pipeline(picture)
        if results:
            return results[0]["generated_text"]
        return ""
    except Exception as err:
        print(f"Image OCR error: {err}")
        return None
87
+
88
def detect_language(text_snippet):
    """Return the top language code predicted by the fastText detector.

    Newlines are replaced by spaces and the snippet is stripped before
    prediction.

    Args:
        text_snippet: Text to classify.

    Returns:
        The part of the top label after its last "__" separator, or None when
        the snippet is blank or the model returns no prediction.
        NOTE(review): for the configured model the labels presumably look
        like "__label__mal_Mlym", so the returned code would include the
        script suffix -- confirm against the model card.
    """
    flattened = text_snippet.replace("\n", " ").strip()
    if not flattened:
        return None

    predictions = lang_detect_model.predict(flattened, k=1)
    if not predictions or not predictions[0]:
        return None

    top_label = predictions[0][0]
    return top_label.split("__")[-1]
97
+
98
def translate_chunk(chunk, src_lang="mal_Mlym", tgt_lang="eng_Latn"):
    """Translate one piece of text with the module-level translation model.

    Args:
        chunk: Text to translate.
        src_lang: Source language tag passed to the IndicProcessor.
        tgt_lang: Target language tag passed to the IndicProcessor.

    Returns:
        The translated text, or an empty string when the chunk is blank.
    """
    if not chunk.strip():
        return ""

    prepared = ip.preprocess_batch([chunk], src_lang=src_lang, tgt_lang=tgt_lang)
    # Inputs longer than 256 tokens are truncated by the tokenizer.
    encoded = tokenizer(
        prepared,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
    ).to(DEVICE)

    with torch.no_grad():
        output_ids = translation_model.generate(
            **encoded,
            use_cache=False,
            min_length=0,
            max_length=256,
            num_beams=5,
            num_return_sequences=1,
        )

    raw_text = tokenizer.batch_decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    # Only one chunk was submitted, so the first entry is the result.
    return ip.postprocess_batch(raw_text, lang=tgt_lang)[0]
109
+
110
def summarize_with_gemini(text_to_summarize, api_key):
    """Send a document to the Gemini API and return its structured analysis.

    The request asks for a JSON response constrained by a schema with the
    keys "summary", "actions_required", "departments_to_notify" and
    "cross_document_flags".

    Args:
        text_to_summarize: Document text, embedded verbatim in the prompt.
        api_key: Gemini API key; must be non-empty.

    Returns:
        The JSON object parsed from the text of the first candidate in the
        API response.

    Raises:
        gr.Error: If the key is missing, the HTTP request fails, the response
            contains no candidates, or the candidate text cannot be parsed.
    """
    if not api_key:
        raise gr.Error("Gemini API key is not configured. Please set it in the Space Secrets.")

    print("\nAnalyzing the document with Gemini...")
    model = 'gemini-1.5-flash-latest'
    # The key is sent in a header rather than in the query string: request
    # exceptions include the URL in their message, and that message is shown
    # to the user below, which would otherwise expose the key.
    api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
    headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}

    prompt = f"""
    You are an expert AI assistant for KMRL (Kochi Metro Rail Limited) document management.
    You have been given the following document. Your task is to analyze it and extract key information.

    **Document Content:**
    ---
    {text_to_summarize}
    ---

    Based on the document, perform the following actions and provide the output in a valid JSON format:
    1. Summarize the document in 2-3 concise sentences highlighting key points.
    2. Identify specific actions required. For each action, detect any timeline/deadline, assign a priority ("High", "Medium", or "Low"), and add brief notes for traceability.
    3. Suggest a list of departments that should be notified.
    4. Detect if this document references or relates to previous incidents, maintenance logs, or similar documents and flag any recurring issues.
    """

    json_schema = {
        "type": "OBJECT",
        "properties": {
            "summary": {"type": "STRING"},
            "actions_required": {
                "type": "ARRAY",
                "items": {
                    "type": "OBJECT",
                    "properties": {
                        "action": {"type": "STRING"},
                        "priority": {"type": "STRING", "enum": ["High", "Medium", "Low"]},
                        "deadline": {"type": "STRING"},
                        "notes": {"type": "STRING"},
                    },
                    "required": ["action", "priority", "deadline", "notes"],
                },
            },
            "departments_to_notify": {"type": "ARRAY", "items": {"type": "STRING"}},
            "cross_document_flags": {
                "type": "ARRAY",
                "items": {
                    "type": "OBJECT",
                    "properties": {
                        "related_document_type": {"type": "STRING"},
                        "related_issue": {"type": "STRING"},
                    },
                    "required": ["related_document_type", "related_issue"],
                },
            },
        },
        "required": ["summary", "actions_required", "departments_to_notify", "cross_document_flags"],
    }

    payload = {
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {"responseMimeType": "application/json", "responseSchema": json_schema},
    }

    try:
        # A timeout keeps a stalled connection from hanging the request forever.
        response = requests.post(api_url, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.RequestException as e:
        raise gr.Error(f"Network error during Gemini API call: {e}") from e
    except ValueError as e:
        # Body was not valid JSON (older requests versions raise plain ValueError).
        raise gr.Error(f"An unexpected error occurred during analysis: {e}") from e

    candidates = result.get("candidates") if isinstance(result, dict) else None
    if not candidates:
        # Raised outside any try block so it is not re-wrapped as "unexpected".
        raise gr.Error(f"Gemini API returned an invalid response: {response.text}")

    try:
        json_text = candidates[0]['content']['parts'][0]['text']
        analysis = json.loads(json_text)
    except (KeyError, IndexError, TypeError, ValueError) as e:
        raise gr.Error(f"An unexpected error occurred during analysis: {e}") from e

    print("✅ Gemini analysis successful.")
    return analysis
156
+
157
+
158
+ # --- MAIN PROCESSING FUNCTION ---
159
def process_and_analyze_document(input_file):
    """Extract text from an uploaded file, translate Malayalam lines, analyze.

    The text is extracted according to the file extension, each non-blank
    line is language-detected, lines detected as LANGUAGE_TO_TRANSLATE are
    translated to English, and the reassembled text is sent to Gemini.

    Args:
        input_file: The uploaded file as delivered by the Gradio File input;
            either an object exposing the path as `.name` or a path string.

    Returns:
        The analysis object returned by summarize_with_gemini.

    Raises:
        gr.Error: If no file was given, the extension is unsupported, no text
            could be extracted, or the text is empty after translation.
    """
    if input_file is None:
        raise gr.Error("No file uploaded. Please upload a document.")

    # Accept both an object carrying the temp path in `.name` and a bare path
    # string, so the function does not depend on how the upload is wrapped.
    input_file_path = getattr(input_file, "name", input_file)
    print(f"Processing file: {input_file_path}")

    ext = os.path.splitext(input_file_path)[1].lower()
    if ext == ".pdf":
        original_text = extract_text_from_pdf(input_file_path)
    elif ext == ".txt":
        original_text = read_text_from_txt(input_file_path)
    elif ext in (".png", ".jpg", ".jpeg", ".bmp", ".tiff"):
        original_text = extract_text_from_image(input_file_path)
    else:
        raise gr.Error("Unsupported file type. Please upload a .pdf, .txt, or image file.")

    # The extractors return None on failure and may return blank text.
    if not original_text or not original_text.strip():
        raise gr.Error("Could not extract any text from the document.")

    translated_lines = []

    # Process line by line so that only Malayalam lines are translated.
    for i, ln in enumerate(original_text.split("\n")):
        if not ln.strip():
            translated_lines.append("")
            continue

        lang = detect_language(ln)
        # The detector may return a code with a script suffix (for example
        # "mal_Mlym"); compare only the language part so that it matches
        # LANGUAGE_TO_TRANSLATE ("mal"). A bare "mal" also matches.
        if lang and lang.split("_")[0] == LANGUAGE_TO_TRANSLATE:
            print(f" -> Translating chunk {i+1} (Malayalam)...")
            translated = translate_chunk(ln, src_lang="mal_Mlym", tgt_lang="eng_Latn")
            translated_lines.append(translated)
        else:
            translated_lines.append(ln)  # Keep non-Malayalam text as-is

    translated_text = "\n".join(translated_lines)

    if not translated_text.strip():
        raise gr.Error("The document was empty after translation.")

    # Final step: analyze with Gemini
    return summarize_with_gemini(translated_text, GEMINI_API_KEY)
206
+
207
+ # --- GRADIO INTERFACE ---
208
# Components are created first so the Interface call below is plain wiring.
_document_input = gr.File(label="Upload Document (.pdf, .txt, .png, .jpeg)")
_analysis_output = gr.JSON(label="Analysis Result")

iface = gr.Interface(
    fn=process_and_analyze_document,
    inputs=_document_input,
    outputs=_analysis_output,
    title="KMRL Document Analysis Pipeline",
    description="Upload a document (Malayalam or English). The system will detect and translate Malayalam text to English, then send the full text to Gemini for structured analysis.",
    allow_flagging="never",
    # The example only works if this file has been uploaded to the Space.
    examples=[["Malayalam-en.txt"]],
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ huggingface_hub
3
+ fasttext
4
+ pymupdf
5
+ pillow
6
+ torch
7
+ sentencepiece
8
+ IndicTransToolkit
9
+ gradio
10
+ requests
11
+ accelerate