Sheikh Mohammad Rakib committed on
Commit
c19f82c
·
1 Parent(s): a9ffeba

refactor: remove custom retry logic and progress tracking in favor of direct requests with status logging

Browse files
Files changed (1) hide show
  1. app.py +116 -215
app.py CHANGED
@@ -1,9 +1,8 @@
1
  import gradio as gr
2
- import spaces
3
  import requests
4
  import uuid
5
  import base64
6
- import time
7
  from pathlib import Path
8
 
9
  # ── CONFIG ────────────────────────────────────────────────────────────────────
@@ -22,213 +21,117 @@ def encode_file(path):
22
  return base64.b64encode(f.read()).decode()
23
 
24
 
25
- def modal_post(url, payload, timeout=300, retries=3, label="endpoint"):
26
- """
27
- POST to a Modal endpoint with retry logic for cold-start empty responses.
28
- Returns (dict_result, error_string). One of them will be None.
29
- """
30
- last_err = None
31
- for attempt in range(1, retries + 1):
32
- try:
33
- r = requests.post(url, json=payload, timeout=timeout)
34
-
35
- # Empty body = Modal cold-starting or gateway hiccup β€” retry
36
- if not r.text or not r.text.strip():
37
- wait = attempt * 10
38
- last_err = f"{label} returned HTTP {r.status_code} with empty body (attempt {attempt}/{retries})"
39
- if attempt < retries:
40
- time.sleep(wait)
41
- continue
42
- return None, last_err
43
-
44
- try:
45
- return r.json(), None
46
- except Exception:
47
- import json as _json
48
- raw = r.text.strip()
49
- try:
50
- obj, _ = _json.JSONDecoder().raw_decode(raw)
51
- return obj, None
52
- except Exception:
53
- body = raw[:500]
54
- return None, f"{label} HTTP {r.status_code}: {body}"
55
-
56
- except requests.exceptions.Timeout:
57
- last_err = f"{label} timed out after {timeout}s (attempt {attempt}/{retries})"
58
- if attempt < retries:
59
- time.sleep(5)
60
- continue
61
- except requests.exceptions.ConnectionError as e:
62
- last_err = f"{label} connection error: {e}"
63
- if attempt < retries:
64
- time.sleep(10)
65
- continue
66
- except Exception as e:
67
- return None, f"{label} unexpected error: {e}"
68
-
69
- return None, last_err
70
-
71
-
72
- def modal_get(url, timeout=90, retries=2, label="endpoint"):
73
- """GET from a Modal endpoint with retry."""
74
- last_err = None
75
- for attempt in range(1, retries + 1):
76
- try:
77
- r = requests.get(url, timeout=timeout)
78
- if not r.text or not r.text.strip():
79
- last_err = f"{label} empty response (attempt {attempt}/{retries})"
80
- if attempt < retries:
81
- time.sleep(15)
82
- continue
83
- return None, last_err
84
- try:
85
- return r.json(), None
86
- except Exception:
87
- import json as _json
88
- raw = r.text.strip()
89
- try:
90
- obj, _ = _json.JSONDecoder().raw_decode(raw)
91
- return obj, None
92
- except Exception:
93
- return None, f"{label} HTTP {r.status_code}: {raw[:300]}"
94
- except Exception as e:
95
- last_err = f"{label} error: {e}"
96
- if attempt < retries:
97
- time.sleep(10)
98
- return None, last_err
99
-
100
-
101
- def _safe_json(r, label):
102
- """Parse JSON from a response, tolerating extra data after the first object."""
103
- import json as _json
104
- try:
105
- return r.json(), None
106
- except Exception:
107
- raw = r.text.strip()
108
- try:
109
- obj, _ = _json.JSONDecoder().raw_decode(raw)
110
- return obj, None
111
- except Exception:
112
- return None, f"{label} HTTP {r.status_code}: {raw[:500]}"
113
-
114
-
115
- @spaces.GPU
116
- def build_persona(name, relationship, text_input, photo_captions, voice_file, photo_files, scanned_files,
117
- progress=gr.Progress(track_tqdm=False)):
118
  if not name.strip():
119
  return "❌ Please enter the person's name.", None, gr.update()
120
 
121
  texts = [t.strip() for t in text_input.strip().split("---") if t.strip()] if text_input.strip() else []
122
  captions = [c.strip() for c in photo_captions.strip().split("\n") if c.strip()] if photo_captions.strip() else []
123
  voice_transcripts = []
 
 
 
124
 
125
  if not texts and not captions and voice_file is None and not photo_files and not scanned_files:
126
  return "❌ Please provide at least one input.", None, gr.update()
127
 
128
- # Work out how many steps we actually have so the bar fills evenly
129
- n_photos = len(photo_files) if photo_files else 0
130
- n_scans = len(scanned_files) if scanned_files else 0
131
- has_voice = voice_file is not None
132
- total_steps = 1 + int(has_voice) + n_photos + n_scans # 1 = build-persona
133
- done = 0
134
-
135
- def advance(msg):
136
- nonlocal done
137
- done += 1
138
- progress(done / total_steps, desc=msg)
139
-
140
- progress(0, desc="πŸ•―οΈ Starting…")
141
-
142
  # 1. Transcribe voice note (Cohere ASR)
143
- if has_voice:
144
- progress(done / total_steps, desc="🎀 Transcribing voice note…")
145
- result, err = modal_post(
146
- TRANSCRIBE_URL,
147
- {"audio_b64": encode_file(voice_file), "filename": Path(voice_file).name},
148
- timeout=180, label="transcribe"
149
- )
150
- if err:
151
- return f"❌ Voice transcription failed: {err}", None, gr.update()
152
- transcript = result.get("transcript", "")
153
- if transcript:
154
- voice_transcripts.append(transcript)
155
- advance("🎀 Voice transcribed")
 
 
 
 
 
156
 
157
  # 2. Describe uploaded photos (MiniCPM-V)
158
- for i, photo in enumerate(photo_files or []):
159
- progress(done / total_steps, desc=f"πŸ‘οΈ Analysing photo {i+1}/{n_photos}…")
160
- result, err = modal_post(
161
- VISION_URL, {"image_b64": encode_file(photo)},
162
- timeout=180, label="vision"
163
- )
164
- if err:
165
- captions.append(f"[Photo description failed: {err}]")
166
- else:
167
- desc = result.get("description", "")
168
- if desc:
169
- captions.append(desc)
170
- advance(f"πŸ‘οΈ Photo {i+1} analysed")
 
 
 
171
 
172
  # 3. OCR scanned letters (Nemotron Parse)
173
- for i, scan in enumerate(scanned_files or []):
174
- progress(done / total_steps, desc=f"πŸ“„ OCR scan {i+1}/{n_scans}…")
175
- result, err = modal_post(
176
- OCR_URL, {"image_b64": encode_file(scan)},
177
- timeout=180, label="ocr"
178
- )
179
- if err:
180
- texts.append(f"[OCR failed: {err}]")
181
- else:
182
- ocr_text = result.get("text", "")
183
- if ocr_text:
184
- texts.append(ocr_text)
185
- advance(f"πŸ“„ Scan {i+1} done")
186
-
187
- # 4. Build persona (Qwen 32B) β€” long timeout, more retries for cold start
188
- progress(done / total_steps, desc="🧠 Building persona β€” this may take 1–3 min…")
 
 
 
 
 
 
 
189
  persona_id = str(uuid.uuid4())[:8]
190
- result, err = modal_post(
191
- BUILD_PERSONA_URL,
192
- {
193
  "persona_id": persona_id, "name": name.strip(),
194
  "relationship": relationship.strip(),
195
  "texts": texts, "photo_captions": captions,
196
  "voice_transcripts": voice_transcripts,
197
- },
198
- timeout=1200, retries=3, label="build-persona"
199
- )
200
-
201
- if err:
202
- return f"❌ {err}", None, gr.update()
203
-
204
- # Tolerate extra data in the JSON response
205
- if isinstance(result, str):
206
- import json as _json
207
- try:
208
- result, _ = _json.JSONDecoder().raw_decode(result.strip())
209
- except Exception:
210
- return f"❌ Backend error: could not parse response", None, gr.update()
211
-
212
- progress(1.0, desc="βœ… Done!")
213
-
214
- if result.get("success"):
215
- persona = result["persona"]
216
- summary = f"""βœ… **{name}'s memory has been preserved.**
217
 
218
  **Persona ID:** `{persona_id}`
219
  **Personality:** {', '.join(persona.get('personality_traits', [])[:3])}
220
  **Language:** {persona.get('language', 'Auto')}
221
  **Memories captured:** {len(persona.get('key_memories', []))}
222
- **Voice style:** {persona.get('voice_description', 'N/A')}
223
 
224
  Go to the **πŸ’¬ Talk** tab and enter the Persona ID."""
225
- return summary, persona_id, gr.update(value=persona_id)
226
- else:
227
- error_detail = result.get("error", str(result))
228
- return f"❌ Backend error: {error_detail}", None, gr.update()
 
 
 
 
 
 
 
 
229
 
230
 
231
- @spaces.GPU
232
  def chat_with_persona(persona_id, message, history, language, enable_voice):
233
  history = history or []
234
 
@@ -239,38 +142,34 @@ def chat_with_persona(persona_id, message, history, language, enable_voice):
239
  if not message.strip():
240
  return "", history, None
241
 
242
- result, err = modal_post(
243
- CHAT_URL,
244
- {
245
  "persona_id": persona_id.strip(),
246
  "history": [{"role": m["role"], "content": m["content"]} for m in history],
247
  "message": message.strip(),
248
  "language": language,
249
- },
250
- timeout=180, retries=2, label="chat"
251
- )
252
-
253
- if err:
254
- response_text = f"⚠️ {err}"
255
- voice_desc = "warm elderly voice"
256
- else:
257
  response_text = result.get("text", result.get("response", "..."))
258
  voice_desc = result.get("voice_description", "warm elderly voice")
 
 
 
259
 
260
  history = history + [
261
  {"role": "user", "content": message},
262
  {"role": "assistant", "content": response_text},
263
  ]
264
 
265
- # Generate voice response (VoxCPM2) β€” skip if response is an error
266
  audio_path = None
267
- if enable_voice and not response_text.startswith("⚠️"):
268
  try:
269
  r = requests.post(TTS_URL, json={
270
  "text": response_text,
271
  "voice_description": voice_desc,
272
  }, timeout=180)
273
- if r.status_code == 200 and r.content:
274
  import tempfile
275
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
276
  f.write(r.content)
@@ -282,14 +181,18 @@ def chat_with_persona(persona_id, message, history, language, enable_voice):
282
 
283
 
284
  def load_personas():
285
- result, err = modal_get(LIST_PERSONAS_URL, timeout=90, retries=2, label="list-personas")
286
- if err:
287
- return f"⚠️ {err}\n\nModal may be waking up β€” try again in 30 seconds."
288
- personas = result.get("personas", [])
289
- if not personas:
290
- return "No personas saved yet."
291
- lines = [f"**{p['name']}** ({p['relationship']}) β€” ID: `{p['id']}`" for p in personas]
292
- return "\n\n".join(lines)
 
 
 
 
293
 
294
 
295
  # ── UI ────────────────────────────────────────────────────────────────────────
@@ -315,14 +218,12 @@ with gr.Blocks(title="Memory Keeper") as demo:
315
  <div class="header-sub">Preserve the voice of someone you love. Talk to them again.</div>
316
  <hr class="divider">
317
  <div style="text-align:center; margin-bottom:16px;">
318
- <span class="model-badge">🧠 MiniCPM4.1-8B (8B)</span>
319
- <span class="model-badge">πŸ‘οΈ MiniCPM-V 4.6 (8B)</span>
320
- <span class="model-badge">🎀 Cohere Transcribe (2B)</span>
321
- <span class="model-badge">πŸ“„ Nemotron Parse v1.2 (&lt;1B)</span>
322
- <span class="model-badge">🌍 Tiny Aya Fire (3.35B)</span>
323
- <span class="model-badge">🌊 Tiny Aya Water (3.35B)</span>
324
- <span class="model-badge">πŸ”Š VoxCPM2 (~1B)</span>
325
- <span class="model-badge">Total: ~26.7B params</span>
326
  </div>
327
  """)
328
 
@@ -333,12 +234,12 @@ with gr.Blocks(title="Memory Keeper") as demo:
333
  gr.HTML("<p style='color:#8a7560; font-style:italic; margin-bottom:16px;'>Upload letters, photos, voice notes, or scanned documents. Each is processed by a specialized AI model.</p>")
334
 
335
  with gr.Row():
336
- name_input = gr.Textbox(label="Their Name", value="Dadu")
337
- relationship_input = gr.Textbox(label="Your Relationship", value="Grandfather")
338
 
339
  text_input = gr.Textbox(
340
  label="πŸ“ Letters / Diary Entries / Writings",
341
- value="Dear family,\n\nWork hard in your studies. Knowledge is the one thing no one can take from you. When I was young, I walked four miles to school every day β€” no shoes, no complaints. We were poor but we were honest.\n\nAlways be kind to your mother. She sacrifices more than you will ever know.\n\nSave your money. A good name lasts longer than any wealth.\n\nI am proud of all of you. Your Dadu misses your faces.\n---\nBeta, I heard you got good marks. This makes my heart so happy. Keep going.",
342
  lines=6,
343
  )
344
 
@@ -354,7 +255,7 @@ with gr.Blocks(title="Memory Keeper") as demo:
354
 
355
  photo_captions = gr.Textbox(
356
  label="πŸ–ΌοΈ Manual Photo Captions (optional, one per line)",
357
- value="Old black and white photo, Dadu sitting on a wooden chair, wearing white panjabi, smiling warmly\nEid gathering, Dadu surrounded by grandchildren, laughing out loud\nDadu in the garden early morning, watering plants carefully",
358
  lines=3,
359
  )
360
 
@@ -390,7 +291,7 @@ with gr.Blocks(title="Memory Keeper") as demo:
390
  chatbot = gr.Chatbot(label="", height=420, placeholder="*Their words will appear here...*")
391
 
392
  with gr.Row():
393
- msg_input = gr.Textbox(label="Your message", value="What is the most important lesson in life?", lines=2, scale=4)
394
  send_btn = gr.Button("Send β†’", variant="primary", scale=1)
395
 
396
  voice_output = gr.Audio(label="πŸ”Š Voice Response", visible=True, autoplay=True)
 
1
  import gradio as gr
 
2
  import requests
3
  import uuid
4
  import base64
5
+ import json
6
  from pathlib import Path
7
 
8
  # ── CONFIG ────────────────────────────────────────────────────────────────────
 
21
  return base64.b64encode(f.read()).decode()
22
 
23
 
24
+ def build_persona(name, relationship, text_input, photo_captions, voice_file, photo_files, scanned_files):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  if not name.strip():
26
  return "❌ Please enter the person's name.", None, gr.update()
27
 
28
  texts = [t.strip() for t in text_input.strip().split("---") if t.strip()] if text_input.strip() else []
29
  captions = [c.strip() for c in photo_captions.strip().split("\n") if c.strip()] if photo_captions.strip() else []
30
  voice_transcripts = []
31
+
32
+ # We will build a step-by-step log to show the user exactly what succeeded/failed
33
+ status_log = []
34
 
35
  if not texts and not captions and voice_file is None and not photo_files and not scanned_files:
36
  return "❌ Please provide at least one input.", None, gr.update()
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  # 1. Transcribe voice note (Cohere ASR)
39
+ if voice_file is not None:
40
+ try:
41
+ r = requests.post(TRANSCRIBE_URL, json={
42
+ "audio_b64": encode_file(voice_file),
43
+ "filename": Path(voice_file).name,
44
+ }, timeout=180)
45
+
46
+ if r.status_code == 200:
47
+ transcript = r.json().get("transcript", "")
48
+ if transcript:
49
+ voice_transcripts.append(transcript)
50
+ status_log.append("βœ… Voice note transcribed successfully.")
51
+ else:
52
+ status_log.append("⚠️ Voice note processed, but no text was found.")
53
+ else:
54
+ status_log.append(f"❌ Voice transcription failed (HTTP {r.status_code}): {r.text}")
55
+ except Exception as e:
56
+ status_log.append(f"❌ Voice transcription failed: {e}")
57
 
58
  # 2. Describe uploaded photos (MiniCPM-V)
59
+ if photo_files:
60
+ success_count = 0
61
+ for i, photo in enumerate(photo_files):
62
+ try:
63
+ r = requests.post(VISION_URL, json={"image_b64": encode_file(photo)}, timeout=180)
64
+ if r.status_code == 200:
65
+ desc = r.json().get("description", "")
66
+ if desc:
67
+ captions.append(desc)
68
+ success_count += 1
69
+ else:
70
+ status_log.append(f"❌ Photo {i+1} description failed (HTTP {r.status_code}).")
71
+ except Exception as e:
72
+ status_log.append(f"❌ Photo {i+1} description failed: {e}")
73
+ if success_count > 0:
74
+ status_log.append(f"βœ… {success_count}/{len(photo_files)} photos described successfully.")
75
 
76
  # 3. OCR scanned letters (Nemotron Parse)
77
+ if scanned_files:
78
+ success_count = 0
79
+ for i, scan in enumerate(scanned_files):
80
+ try:
81
+ r = requests.post(OCR_URL, json={"image_b64": encode_file(scan)}, timeout=180)
82
+ if r.status_code == 200:
83
+ ocr_text = r.json().get("text", "")
84
+ if ocr_text:
85
+ texts.append(ocr_text)
86
+ success_count += 1
87
+ else:
88
+ status_log.append(f"❌ Scan {i+1} OCR failed (HTTP {r.status_code}).")
89
+ except Exception as e:
90
+ status_log.append(f"❌ Scan {i+1} OCR failed: {e}")
91
+ if success_count > 0:
92
+ status_log.append(f"βœ… {success_count}/{len(scanned_files)} scanned documents read successfully.")
93
+
94
+ # Check if we have AT LEAST SOME data to build the persona
95
+ if not texts and not captions and not voice_transcripts:
96
+ status_log.append("\n❌ **ABORTED:** All AI processing failed, and no manual text/captions were provided. Cannot build persona.")
97
+ return "\n\n".join(status_log), None, gr.update()
98
+
99
+ # 4. Build persona (Qwen 32B)
100
  persona_id = str(uuid.uuid4())[:8]
101
+ try:
102
+ r = requests.post(BUILD_PERSONA_URL, json={
 
103
  "persona_id": persona_id, "name": name.strip(),
104
  "relationship": relationship.strip(),
105
  "texts": texts, "photo_captions": captions,
106
  "voice_transcripts": voice_transcripts,
107
+ }, timeout=1200)
108
+
109
+ if r.status_code == 200:
110
+ result = r.json()
111
+ if result.get("success"):
112
+ persona = result["persona"]
113
+ summary = f"""\nπŸŽ‰ **{name}'s memory has been successfully preserved!**
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
  **Persona ID:** `{persona_id}`
116
  **Personality:** {', '.join(persona.get('personality_traits', [])[:3])}
117
  **Language:** {persona.get('language', 'Auto')}
118
  **Memories captured:** {len(persona.get('key_memories', []))}
 
119
 
120
  Go to the **πŸ’¬ Talk** tab and enter the Persona ID."""
121
+ status_log.append(summary)
122
+ return "\n".join(status_log), persona_id, gr.update(value=persona_id)
123
+ else:
124
+ status_log.append(f"\n❌ Persona builder failed: {result}")
125
+ else:
126
+ status_log.append(f"\n❌ Persona builder failed (HTTP {r.status_code}): {r.text}")
127
+
128
+ except Exception as e:
129
+ status_log.append(f"\n❌ Persona builder failed: {e}")
130
+
131
+ # Fallback return if the final step failed
132
+ return "\n\n".join(status_log), None, gr.update()
133
 
134
 
 
135
  def chat_with_persona(persona_id, message, history, language, enable_voice):
136
  history = history or []
137
 
 
142
  if not message.strip():
143
  return "", history, None
144
 
145
+ try:
146
+ r = requests.post(CHAT_URL, json={
 
147
  "persona_id": persona_id.strip(),
148
  "history": [{"role": m["role"], "content": m["content"]} for m in history],
149
  "message": message.strip(),
150
  "language": language,
151
+ }, timeout=180)
152
+ result = r.json()
 
 
 
 
 
 
153
  response_text = result.get("text", result.get("response", "..."))
154
  voice_desc = result.get("voice_description", "warm elderly voice")
155
+ except Exception as e:
156
+ response_text = f"⚠️ Error: {e}"
157
+ voice_desc = "warm elderly voice"
158
 
159
  history = history + [
160
  {"role": "user", "content": message},
161
  {"role": "assistant", "content": response_text},
162
  ]
163
 
164
+ # Generate voice response (VoxCPM2)
165
  audio_path = None
166
+ if enable_voice:
167
  try:
168
  r = requests.post(TTS_URL, json={
169
  "text": response_text,
170
  "voice_description": voice_desc,
171
  }, timeout=180)
172
+ if r.status_code == 200:
173
  import tempfile
174
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
175
  f.write(r.content)
 
181
 
182
 
183
  def load_personas():
184
+ for attempt in range(2):
185
+ try:
186
+ r = requests.get(LIST_PERSONAS_URL, timeout=90)
187
+ personas = r.json().get("personas", [])
188
+ if not personas:
189
+ return "No personas saved yet."
190
+ lines = [f"**{p['name']}** ({p['relationship']}) β€” ID: `{p['id']}`" for p in personas]
191
+ return "\n\n".join(lines)
192
+ except Exception as e:
193
+ if attempt == 0:
194
+ continue
195
+ return f"⚠️ Modal is waking up, please try again in 30 seconds."
196
 
197
 
198
  # ── UI ────────────────────────────────────────────────────────────────────────
 
218
  <div class="header-sub">Preserve the voice of someone you love. Talk to them again.</div>
219
  <hr class="divider">
220
  <div style="text-align:center; margin-bottom:16px;">
221
+ <span class="model-badge">🧠 Qwen2.5-32B</span>
222
+ <span class="model-badge">🎀 Cohere Transcribe</span>
223
+ <span class="model-badge">πŸ‘οΈ MiniCPM-V 4.6</span>
224
+ <span class="model-badge">πŸ“„ Nemotron Parse</span>
225
+ <span class="model-badge">πŸ”Š VoxCPM2</span>
226
+ <span class="model-badge">🌍 Tiny Aya Fire</span>
 
 
227
  </div>
228
  """)
229
 
 
234
  gr.HTML("<p style='color:#8a7560; font-style:italic; margin-bottom:16px;'>Upload letters, photos, voice notes, or scanned documents. Each is processed by a specialized AI model.</p>")
235
 
236
  with gr.Row():
237
+ name_input = gr.Textbox(label="Their Name", placeholder="e.g. Dadu, Nana, Abba...")
238
+ relationship_input = gr.Textbox(label="Your Relationship", placeholder="e.g. Grandfather, Mother...")
239
 
240
  text_input = gr.Textbox(
241
  label="πŸ“ Letters / Diary Entries / Writings",
242
+ placeholder="Paste their writings here. Separate multiple entries with ---",
243
  lines=6,
244
  )
245
 
 
255
 
256
  photo_captions = gr.Textbox(
257
  label="πŸ–ΌοΈ Manual Photo Captions (optional, one per line)",
258
+ placeholder="Or describe photos manually here...",
259
  lines=3,
260
  )
261
 
 
291
  chatbot = gr.Chatbot(label="", height=420, placeholder="*Their words will appear here...*")
292
 
293
  with gr.Row():
294
+ msg_input = gr.Textbox(label="Your message", placeholder="What would you like to say?", lines=2, scale=4)
295
  send_btn = gr.Button("Send β†’", variant="primary", scale=1)
296
 
297
  voice_output = gr.Audio(label="πŸ”Š Voice Response", visible=True, autoplay=True)