asad9641 committed on
Commit
1b0ed47
·
verified ·
1 Parent(s): 17fe773

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +283 -0
app.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ import tempfile
4
+ import requests
5
+ from pathlib import Path
6
+ from dotenv import load_dotenv
7
+ from gtts import gTTS
8
+ from PyPDF2 import PdfReader
9
+ from PIL import Image
10
+ import gradio as gr
11
+ from googletrans import Translator
12
+ from sentence_transformers import SentenceTransformer, util
13
+
14
+ # ------------------ Load API Keys ------------------
15
load_dotenv()


def _required_secret(name):
    """Return the stripped value of environment variable *name*.

    Raises:
        ValueError: if the variable is unset or blank.
    """
    value = os.getenv(name, "").strip()
    if not value:
        raise ValueError(f"❌ {name} missing. Add it in Hugging Face Secrets.")
    return value


GROQ_API_KEY = _required_secret("GROQ_API_KEY")
OCR_SPACE_API_KEY = _required_secret("OCR_SPACE_API_KEY")

# Bearer-token header sent with every Groq API request.
HEADERS = {"Authorization": f"Bearer {GROQ_API_KEY}"}

# ------------------ Global States ------------------
# All of these are in-process dictionaries keyed by session id.
SESSION_HISTORY = {}  # session_id -> list of chat messages ({"role", "content"})
PDF_CONTENT = {}      # session_id -> list of text chunks
PDF_EMBEDS = {}       # session_id -> embeddings of those chunks
IMAGE_TEXT = {}       # session_id -> list of OCR text chunks
IMAGE_EMBEDS = {}     # session_id -> embeddings of those chunks
CHUNK_SIZE = 1500     # number of characters per chunk

# Shared translator and sentence-embedding model, created once at import time.
translator = Translator()
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
36
+
37
+ # ------------------ Utility Functions ------------------
38
def chunk_text(text, size=CHUNK_SIZE):
    """Split *text* into consecutive slices of at most *size* characters.

    The last slice may be shorter; an empty *text* yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), size):
        pieces.append(text[start:start + size])
    return pieces
40
+
41
def synthesize_speech(text, lang="en"):
    """Render *text* to an MP3 file with gTTS and return the file's path.

    Args:
        text: the text to speak.
        lang: language code passed to gTTS.

    Returns:
        Path of a newly created ``.mp3`` temp file, or ``None`` if synthesis
        failed (the error is printed, not raised).
    """
    audio_path = None
    try:
        tts = gTTS(text=text, lang=lang)
        # Reserve a unique path and close the handle right away: the original
        # kept it open forever (descriptor leak), and writing to a file that
        # is still open fails on some platforms.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
            audio_path = temp_file.name
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        print("TTS error:", e)
        if audio_path is not None:
            # Best-effort cleanup so failed attempts do not pile up empty files.
            try:
                os.remove(audio_path)
            except OSError:
                pass
        return None
50
+
51
def select_relevant_chunk(question, chunks, chunk_embeds):
    """Return the entry of *chunks* whose embedding best matches *question*.

    The question is embedded with the shared ``embed_model`` and compared to
    *chunk_embeds* by cosine similarity; the highest-scoring index is used to
    pick from *chunks*.
    """
    question_vec = embed_model.encode(question, convert_to_tensor=True)
    similarity_row = util.cos_sim(question_vec, chunk_embeds)[0]
    best_index = similarity_row.argmax().item()
    return chunks[best_index]
56
+
57
+ # ------------------ Voice Chat ------------------
58
def transcribe_audio(audio_file, timeout=60):
    """Transcribe an audio file with Groq's ``whisper-large-v3`` endpoint.

    Args:
        audio_file: path of the audio file to upload.
        timeout: seconds to wait for the HTTP request; without a timeout a
            stalled connection would block the caller indefinitely.

    Returns:
        The ``"text"`` field of the JSON response (``""`` if absent). On any
        failure, a string starting with ``"Error transcribing audio:"`` is
        returned instead of raising.
    """
    try:
        url = "https://api.groq.com/openai/v1/audio/transcriptions"
        with open(audio_file, "rb") as f:
            files = {"file": ("audio.wav", f, "audio/wav")}
            data = {"model": "whisper-large-v3"}
            resp = requests.post(
                url, headers=HEADERS, files=files, data=data, timeout=timeout
            )
        resp.raise_for_status()
        return resp.json().get("text", "")
    except Exception as e:
        return f"Error transcribing audio: {e}"
69
+
70
def generate_response(session_id, user_text, timeout=60):
    """Send the session's conversation to the Groq chat API and return the reply.

    The user message is appended to ``SESSION_HISTORY[session_id]`` (created
    on first use), then a fixed system prompt plus the whole history is
    posted to the ``llama-3.1-8b-instant`` model.

    Args:
        session_id: key into ``SESSION_HISTORY``.
        user_text: the new user message.
        timeout: seconds to wait for the HTTP request, so a stalled
            connection cannot hang the caller.

    Returns:
        The assistant's reply, which is also appended to the history. On
        failure an ``"Error generating response: ..."`` string is returned;
        it is not stored, but the user message stays in the history.
    """
    history = SESSION_HISTORY.setdefault(session_id, [])
    history.append({"role": "user", "content": user_text})
    messages = [{"role": "system", "content": "You are a helpful AI assistant."}] + history
    body = {"model": "llama-3.1-8b-instant", "messages": messages}
    try:
        resp = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=HEADERS,
            json=body,
            timeout=timeout,
        )
        resp.raise_for_status()
        assistant_msg = resp.json()["choices"][0]["message"]["content"]
        history.append({"role": "assistant", "content": assistant_msg})
        return assistant_msg
    except Exception as e:
        return f"Error generating response: {e}"
84
+
85
# Prefix of the string transcribe_audio() returns when transcription fails.
_TRANSCRIBE_ERROR_PREFIX = "Error transcribing audio:"


def _translate_or_keep(text, src, dest):
    """Translate *text* from *src* to *dest*; return *text* unchanged on failure."""
    try:
        return translator.translate(text, src=src, dest=dest).text
    except Exception as e:
        print("Translation error:", e)
        return text


def handle_voice(audio_file, session_id, tts_lang="en"):
    """Run one voice-chat turn: transcribe, ask the LLM, and speak the reply.

    Args:
        audio_file: path of the recorded audio, or a falsy value if none.
        session_id: key into ``SESSION_HISTORY``.
        tts_lang: language of the speech; when not ``"en"`` the transcript is
            translated to English before the LLM call and the reply is
            translated back before synthesis.

    Returns:
        ``(messages, audio_path)`` where *messages* is the list to display in
        the chat and *audio_path* is the synthesized reply or ``None``.
    """
    history = SESSION_HISTORY.setdefault(session_id, [])
    if not audio_file:
        # Keep the current conversation visible instead of blanking the chat.
        return history, None

    user_text = transcribe_audio(audio_file) or ""
    if user_text.startswith(_TRANSCRIBE_ERROR_PREFIX):
        # Show the failure in the chat without storing it and without
        # sending the error text to the LLM as if the user had said it.
        return history + [{"role": "assistant", "content": user_text}], None
    if not user_text.strip():
        return history, None

    # Translate if needed
    english_text = user_text
    if tts_lang != "en":
        english_text = _translate_or_keep(user_text, tts_lang, "en")

    assistant_text = generate_response(session_id, english_text)

    # Translate back for TTS
    tts_text = assistant_text
    if tts_lang != "en":
        tts_text = _translate_or_keep(assistant_text, "en", tts_lang)

    audio_path = synthesize_speech(tts_text, lang=tts_lang)
    return SESSION_HISTORY[session_id], audio_path
104
+
105
def reset_voice():
    """Start a fresh voice session.

    Registers an empty history under a newly generated UUID and returns
    ``(new_session_id, [])`` — the id for the session state and an empty
    message list for the chat display.
    """
    fresh_session = str(uuid.uuid4())
    SESSION_HISTORY[fresh_session] = []
    return fresh_session, []
109
+
110
+ # ------------------ PDF Handling ------------------
111
def handle_pdf_upload(pdf_file, session_id):
    """Extract, chunk and embed an uploaded PDF for later question answering.

    Args:
        pdf_file: the uploaded file, either a path string or an object whose
            ``name`` attribute is the path.
        session_id: key under which chunks and embeddings are cached.

    Returns:
        ``(status, error)``: exactly one of the two is a non-empty message.
    """
    if not pdf_file:
        return "", "No file uploaded"
    try:
        # Accept both a plain path and a file-like object exposing `.name`.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        reader = PdfReader(pdf_path)
        # Join with newlines so words at page boundaries are not fused.
        text = "\n".join(page.extract_text() or "" for page in reader.pages)
        if not text.strip():
            return "", "No extractable content found in PDF."
        chunks = chunk_text(text)
        # Embed before storing anything so a failure here cannot leave chunks
        # cached without matching embeddings.
        embeddings = embed_model.encode(chunks, convert_to_tensor=True)
        PDF_CONTENT[session_id] = chunks
        PDF_EMBEDS[session_id] = embeddings
        return "PDF uploaded successfully!", ""
    except Exception as e:
        return "", f"Error processing PDF: {e}"
127
+
128
def handle_pdf_question(question, session_id, timeout=60):
    """Answer *question* using the most relevant chunk of the session's PDF.

    Args:
        question: the user's question text.
        session_id: key into ``PDF_CONTENT`` / ``PDF_EMBEDS``.
        timeout: seconds to wait for the Groq HTTP request, so a stalled
            connection cannot hang the caller.

    Returns:
        The model's answer, or a human-readable message when no PDF is
        loaded, the question is blank, or the request fails.
    """
    if session_id not in PDF_CONTENT:
        return "Document not found. Please upload first."
    if not question or not question.strip():
        return "Please enter a question."
    try:
        # Retrieval is inside the try so embedding/lookup errors are reported
        # to the user instead of escaping as an unhandled exception.
        chunk = select_relevant_chunk(question, PDF_CONTENT[session_id], PDF_EMBEDS[session_id])
        messages = [
            {"role": "system", "content": "You are a helpful assistant summarizing the PDF."},
            {"role": "user", "content": f"PDF Content: {chunk} ... Question: {question}"},
        ]
        body = {"model": "llama-3.1-8b-instant", "messages": messages}
        resp = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=HEADERS,
            json=body,
            timeout=timeout,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
    except Exception as e:
        return f"Error generating response: {e}"
143
+
144
def handle_pdf_question_voice(audio_file, session_id, tts_lang="en"):
    """Answer a spoken question about the session's PDF and speak the answer.

    Args:
        audio_file: path of the recorded question, or a falsy value if none.
        session_id: key into ``PDF_CONTENT``.
        tts_lang: language of the speech; when not ``"en"`` the question is
            translated to English first and the answer is translated back
            before synthesis.

    Returns:
        ``(answer, audio_path)``; *audio_path* is ``None`` when nothing was
        recorded, transcription failed, or speech synthesis failed.
    """
    if not audio_file:
        return "", None

    def _safe_translate(text, src, dest):
        # Fall back to the untranslated text rather than crash the handler.
        try:
            return translator.translate(text, src=src, dest=dest).text
        except Exception as e:
            print("Translation error:", e)
            return text

    question = transcribe_audio(audio_file) or ""
    if question.startswith("Error transcribing audio:"):
        # Report the failure instead of asking the LLM about an error string.
        return question, None

    # Translate if needed
    translated_question = question
    if tts_lang != "en":
        translated_question = _safe_translate(question, tts_lang, "en")

    if session_id not in PDF_CONTENT:
        answer = "No PDF uploaded. Please upload first."
    else:
        # Same retrieval + prompt as the text tab; reuse it rather than
        # keeping a second copy of the request logic.
        answer = handle_pdf_question(translated_question, session_id)

    # Translate back for TTS
    tts_text = answer
    if tts_lang != "en":
        tts_text = _safe_translate(answer, "en", tts_lang)
    audio_path = synthesize_speech(tts_text, lang=tts_lang)
    return answer, audio_path
177
+
178
def download_pdf_summary(session_id):
    """Write the session's assistant messages to a text file and return its path.

    Args:
        session_id: key into ``SESSION_HISTORY``.

    Returns:
        Path of a new ``.txt`` temp file containing the assistant messages
        joined by newlines, or ``None`` if the session has no history entry.
    """
    # NOTE(review): within this file only the voice-chat functions write to
    # SESSION_HISTORY; the PDF question handlers do not, so a PDF session id
    # is not expected to be found here — confirm the intended data source.
    if session_id not in SESSION_HISTORY:
        return None
    summary = "\n".join(
        msg["content"]
        for msg in SESSION_HISTORY[session_id]
        if msg["role"] == "assistant"
    )
    # Write through the handle NamedTemporaryFile already opened; the original
    # reopened the path and never closed the first handle.
    with tempfile.NamedTemporaryFile(
        "w", delete=False, suffix=".txt", encoding="utf-8"
    ) as f:
        f.write(summary)
        return f.name
186
+
187
+ # ------------------ Image OCR via OCR.Space ------------------
188
def handle_image_upload_ocr(image_file, session_id, timeout=60):
    """OCR an uploaded image via OCR.Space, then chunk and embed the text.

    Args:
        image_file: the uploaded file, either a path string or an object
            whose ``name`` attribute is the path.
        session_id: key under which chunks and embeddings are cached.
        timeout: seconds to wait for the OCR HTTP request.

    Returns:
        ``(status, error)``: one is a message, the other ``None``.
    """
    if not image_file:
        return None, "No image uploaded"
    try:
        # Accept both a plain path and a file-like object exposing `.name`.
        image_path = getattr(image_file, "name", image_file)
        with open(image_path, "rb") as f:
            response = requests.post(
                'https://api.ocr.space/parse/image',
                files={'file': f},
                data={'apikey': OCR_SPACE_API_KEY, 'language': 'eng'},
                timeout=timeout,
            )
        response.raise_for_status()
        result = response.json()
        if not isinstance(result, dict):
            return None, f"Error reading image: {result}"
        if result.get('IsErroredOnProcessing'):
            # Surface the service's own message instead of a bare KeyError.
            message = result.get('ErrorMessage') or "OCR service reported an error"
            if isinstance(message, (list, tuple)):
                message = "; ".join(str(part) for part in message)
            return None, f"Error reading image: {message}"
        parsed_results = result.get('ParsedResults') or []
        parsed_text = (parsed_results[0].get('ParsedText') or "") if parsed_results else ""
        if not parsed_text.strip():
            return None, "No extractable text found in the image."
        chunks = chunk_text(parsed_text)
        # Embed before storing so a failure cannot leave chunks cached
        # without matching embeddings.
        embeddings = embed_model.encode(chunks, convert_to_tensor=True)
        IMAGE_TEXT[session_id] = chunks
        IMAGE_EMBEDS[session_id] = embeddings
        return "Image uploaded successfully!", None
    except Exception as e:
        return None, f"Error reading image: {e}"
208
+
209
def handle_image_question(question, session_id, timeout=60):
    """Answer *question* using the most relevant chunk of the image's OCR text.

    Args:
        question: the user's question text.
        session_id: key into ``IMAGE_TEXT`` / ``IMAGE_EMBEDS``.
        timeout: seconds to wait for the Groq HTTP request, so a stalled
            connection cannot hang the caller.

    Returns:
        The model's answer, or a human-readable message when no image is
        loaded, the question is blank, or the request fails.
    """
    if session_id not in IMAGE_TEXT:
        return "Image not found. Please upload first."
    if not question or not question.strip():
        return "Please enter a question."
    try:
        # Retrieval is inside the try so embedding/lookup errors are reported
        # to the user instead of escaping as an unhandled exception.
        chunk = select_relevant_chunk(question, IMAGE_TEXT[session_id], IMAGE_EMBEDS[session_id])
        messages = [
            {"role": "system", "content": "You are a helpful assistant summarizing image text."},
            {"role": "user", "content": f"Image Text: {chunk} ... Question: {question}"},
        ]
        body = {"model": "llama-3.1-8b-instant", "messages": messages}
        resp = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=HEADERS,
            json=body,
            timeout=timeout,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
    except Exception as e:
        return f"Error generating response: {e}"
224
+
225
+ # ------------------ Gradio UI ------------------
226
def _new_session_id():
    """Return a fresh random session id."""
    return str(uuid.uuid4())


def _single_message(handler):
    """Adapt an upload handler returning ``(status, error)`` to one message.

    Both values used to be routed to the same Textbox, so the second one
    overwrote the first and success messages were never shown.
    """
    def run(uploaded_file, session_id):
        status, error = handler(uploaded_file, session_id)
        return error or status or ""
    return run


def _reset_pdf_session(old_session_id):
    """Drop the old session's cached PDF data and return (new id, cleared text)."""
    PDF_CONTENT.pop(old_session_id, None)
    PDF_EMBEDS.pop(old_session_id, None)
    return _new_session_id(), ""


def _reset_image_session(old_session_id):
    """Drop the old session's cached OCR data and return (new id, cleared text)."""
    IMAGE_TEXT.pop(old_session_id, None)
    IMAGE_EMBEDS.pop(old_session_id, None)
    return _new_session_id(), ""


# NOTE(review): the original indentation was not available; which widgets sit
# inside gr.Row() is a layout assumption — confirm against the running app.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🛠 Multi-Mode AI Assistant (Voice, PDF, Image)")

    # Pass a callable so every page load gets its own id; a literal value is
    # computed once at build time and would be shared by all visitors.
    session_voice = gr.State(_new_session_id)
    session_pdf = gr.State(_new_session_id)
    session_image = gr.State(_new_session_id)

    # --- Voice ---
    with gr.Tab("🎤 Voice Chat"):
        chat_voice = gr.Chatbot(type="messages", height=380)
        with gr.Row():
            mic = gr.Audio(type="filepath", label="Hold & speak")
            tts_lang = gr.Dropdown(choices=["en", "ur"], value="en", label="Voice Language")
            send_voice = gr.Button("Send")
        audio_output = gr.Audio(label="Assistant Voice Output", type="filepath")
        reset_v = gr.Button("♻ Reset Voice Chat")
        send_voice.click(handle_voice, inputs=[mic, session_voice, tts_lang], outputs=[chat_voice, audio_output])
        reset_v.click(reset_voice, outputs=[session_voice, chat_voice])

    # --- PDF (Text) ---
    with gr.Tab("📄 PDF Summarizer"):
        pdf_output = gr.Textbox(label="Answer (Text Only)", lines=20, max_lines=40)
        pdf_upload_btn = gr.File(label="Upload PDF", file_types=[".pdf"])
        pdf_question = gr.Textbox(label="Ask a question about PDF", lines=2)
        pdf_send_btn = gr.Button("Ask")
        pdf_reset_btn = gr.Button("♻ Reset PDF")
        pdf_download_btn = gr.Button("📥 Download Summary")
        # The summary is a file path, so it needs a File component to be
        # downloadable; a Textbox would only display the path.
        pdf_summary_file = gr.File(label="Summary file", interactive=False)
        pdf_upload_btn.upload(_single_message(handle_pdf_upload), inputs=[pdf_upload_btn, session_pdf], outputs=[pdf_output])
        pdf_send_btn.click(handle_pdf_question, inputs=[pdf_question, session_pdf], outputs=[pdf_output])
        pdf_reset_btn.click(_reset_pdf_session, inputs=[session_pdf], outputs=[session_pdf, pdf_output])
        pdf_download_btn.click(download_pdf_summary, inputs=[session_pdf], outputs=[pdf_summary_file])

    # --- PDF Voice Question ---
    with gr.Tab("📄 PDF Voice Question"):
        pdf_voice_chat = gr.Textbox(label="Assistant Answer", lines=10)
        pdf_voice_audio = gr.Audio(label="Assistant Voice Output", type="filepath")
        pdf_voice_input = gr.Audio(type="filepath", label="Hold & speak PDF question")
        pdf_voice_lang = gr.Dropdown(choices=["en", "ur"], value="en", label="Voice Language")
        pdf_voice_btn = gr.Button("Ask via Voice")
        pdf_voice_btn.click(
            handle_pdf_question_voice,
            inputs=[pdf_voice_input, session_pdf, pdf_voice_lang],
            outputs=[pdf_voice_chat, pdf_voice_audio],
        )

    # --- Image ---
    with gr.Tab("🖼 Image OCR"):
        image_output = gr.Textbox(label="Answer (Text Only)", lines=20, max_lines=40)
        image_upload_btn = gr.File(label="Upload Image", file_types=[".png", ".jpg", ".jpeg"])
        image_question = gr.Textbox(label="Ask a question about Image", lines=2)
        image_send_btn = gr.Button("Ask")
        image_reset_btn = gr.Button("♻ Reset Image")
        image_upload_btn.upload(_single_message(handle_image_upload_ocr), inputs=[image_upload_btn, session_image], outputs=[image_output])
        image_send_btn.click(handle_image_question, inputs=[image_question, session_image], outputs=[image_output])
        image_reset_btn.click(_reset_image_session, inputs=[session_image], outputs=[session_image, image_output])

if __name__ == "__main__":
    demo.launch()