rairo committed on
Commit
3b7bd59
·
verified ·
1 Parent(s): 2ef4d06

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +510 -372
main.py CHANGED
@@ -1,21 +1,32 @@
1
  """
2
- main.py — Iris AI Service (v1.0 - April 2026)
3
 
4
  AI layer for the Iris Support Portal (IrisPlus / Unified Spark Desk).
5
  Deployed as a HuggingFace Space monofile (Flask + Gemini + AssemblyAI + Firebase).
6
 
 
 
 
 
 
 
 
 
 
 
7
  FEATURES:
8
- 1. WhatsApp Export → Knowledge Base (intelligent Gemini extraction, additive only)
9
  2. Bulk KB Upload (CSV / Excel / PDF)
10
- 3. Natural Language + Voice Ticket Submission (AssemblyAI transcription → Gemini extraction)
11
- 4. System Tutorial Ingestion (video transcript → timestamped KB articles)
12
- 5. Agent NL/Voice Solution Writing (same pipeline, agent role)
13
- 6. Iris Chatbot (KB + tutorial source RAG, Firebase persistence)
14
 
15
  ENV VARS:
16
  GOOGLE_API_KEY — Gemini API key
17
  ASSEMBLYAI_API_KEY — AssemblyAI API key
18
  FIREBASE — JSON string of Firebase service account
 
19
  PORT — Server port (default 7860)
20
  """
21
 
@@ -27,8 +38,10 @@ import time
27
  import logging
28
  import base64
29
  import hashlib
 
 
30
  from datetime import datetime, timezone
31
- from typing import Any, Dict, List, Optional
32
 
33
  import requests
34
  from flask import Flask, request, jsonify
@@ -52,7 +65,8 @@ except Exception as e:
52
  logger.error("google-genai not installed: %s", e)
53
 
54
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
55
- GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash-lite")
 
56
 
57
  _gemini_client = None
58
  if genai and GOOGLE_API_KEY:
@@ -99,7 +113,7 @@ def init_firestore() -> Optional[Any]:
99
 
100
  db = init_firestore()
101
 
102
- # ─── Optional file-parsing libs ───────────────────────────────────────────────
103
 
104
  try:
105
  import pandas as pd
@@ -119,28 +133,98 @@ app = Flask(__name__)
119
  CORS(app)
120
 
121
  # ══════════════════════════════════════════════════════════════════════════════
122
- # HELPERS
123
  # ══════════════════════════════════════════════════════════════════════════════
124
 
 
 
 
 
 
 
 
 
 
 
125
  def _safe_json(text: str, fallback: Any) -> Any:
126
- """Strip markdown fences and parse JSON safely."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  try:
128
- clean = text.strip()
129
- if "```json" in clean:
130
- clean = clean.split("```json")[1].split("```")[0]
131
- elif "```" in clean:
132
- clean = clean.split("```")[1].split("```")[0]
133
  return json.loads(clean)
134
- except Exception as e:
135
- logger.error("JSON parse error: %s | text: %s", e, text[:200])
136
- return fallback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
 
139
  def _gemini_text(prompt: str, json_mode: bool = False) -> str:
140
- """Call Gemini and return raw text."""
141
  if not _gemini_client:
142
  return ""
143
- cfg = genai_types.GenerateContentConfig(response_mime_type="application/json") if json_mode else None
 
 
144
  try:
145
  resp = _gemini_client.models.generate_content(
146
  model=GEMINI_MODEL,
@@ -149,18 +233,35 @@ def _gemini_text(prompt: str, json_mode: bool = False) -> str:
149
  )
150
  return resp.text or ""
151
  except Exception as e:
152
- logger.error("Gemini call error: %s", e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  return ""
154
 
155
 
156
  def _article_fingerprint(title: str, content: str) -> str:
157
- """Stable hash to detect duplicate KB articles."""
158
  raw = f"{title.strip().lower()}::{content.strip().lower()[:300]}"
159
  return hashlib.sha256(raw.encode()).hexdigest()[:16]
160
 
161
 
162
  def _get_existing_fingerprints() -> set:
163
- """Fetch all fingerprints already in Firestore KB."""
164
  if not db:
165
  return set()
166
  try:
@@ -172,22 +273,17 @@ def _get_existing_fingerprints() -> set:
172
 
173
 
174
  def _save_kb_articles(articles: List[Dict], source_label: str) -> Dict:
175
- """Save articles to Firestore, skip duplicates. Returns stats."""
176
  if not db:
177
  return {"saved": 0, "skipped": 0, "error": "Firebase unavailable"}
178
-
179
  existing = _get_existing_fingerprints()
180
  saved, skipped = 0, 0
181
-
182
  for article in articles:
183
  title = article.get("title", "Untitled")
184
  content = article.get("content", "")
185
  fp = _article_fingerprint(title, content)
186
-
187
  if fp in existing:
188
  skipped += 1
189
  continue
190
-
191
  doc = {
192
  "title": title,
193
  "content": content,
@@ -197,79 +293,327 @@ def _save_kb_articles(articles: List[Dict], source_label: str) -> Dict:
197
  "fingerprint": fp,
198
  "created_at": datetime.now(timezone.utc).isoformat(),
199
  }
200
- # Carry timestamp crop info from tutorial ingestion if present
201
  if article.get("timestamp_start") is not None:
202
  doc["timestamp_start"] = article["timestamp_start"]
203
  doc["timestamp_end"] = article.get("timestamp_end")
204
  doc["video_url"] = article.get("video_url", "")
205
-
206
  db.collection("iris_kb_articles").add(doc)
207
  existing.add(fp)
208
  saved += 1
209
-
210
  return {"saved": saved, "skipped": skipped}
211
 
212
 
213
  # ══════════════════════════════════════════════════════════════════════════════
214
- # FEATURE 1 — WhatsApp Export → Knowledge Base
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  # ══════════════════════════════════════════════════════════════════════════════
216
 
217
- WHATSAPP_EXTRACTION_PROMPT = """
218
- You are a support knowledge base curator.
 
219
 
220
- You have been given a raw WhatsApp group chat export from a support team.
221
- Your job is to extract ONLY clear problem→solution pairs.
 
 
 
 
 
 
222
 
223
- Rules:
224
- - Ignore greetings, off-topic chatter, emoji-only messages, system notifications.
225
- - Extract only exchanges where a user described an issue AND a support agent (or another user) provided a working solution.
226
- - Each article must be self-contained and searchable.
227
- - Merge follow-up messages that belong to the same resolution thread.
 
 
 
 
228
 
229
- Return a STRICT JSON array. Each object:
230
- {
231
- "title": "Short, searchable title of the issue",
232
- "content": "Full explanation: what the problem was and the step-by-step solution",
233
- "category": "One of: Account, Billing, Technical, Feature, Other",
234
- "tags": ["array", "of", "relevant", "keywords"]
235
- }
236
 
237
- Return ONLY the JSON array, no other text.
 
238
 
239
- WhatsApp Export:
 
 
240
  """
241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  @app.post("/api/kb/whatsapp-import")
243
  def whatsapp_import():
244
  """
245
- POST body: { "chat_text": "<raw WhatsApp export text>" }
246
- Extracts problem→solution pairs, saves new articles (additive, no overwrite).
 
 
 
 
247
  """
248
- body = request.get_json(silent=True) or {}
249
- raw_chat = body.get("chat_text", "").strip()
250
 
251
- if not raw_chat:
252
- return jsonify({"ok": False, "error": "chat_text is required"}), 400
 
 
 
 
253
 
254
- if len(raw_chat) < 100:
255
- return jsonify({"ok": False, "error": "Chat export too short to process"}), 400
256
 
257
- logger.info("WhatsApp import: %d chars received", len(raw_chat))
 
 
258
 
259
- gemini_out = _gemini_text(WHATSAPP_EXTRACTION_PROMPT + raw_chat[:50000], json_mode=True)
260
- articles = _safe_json(gemini_out, [])
261
 
262
- if not isinstance(articles, list):
263
- return jsonify({"ok": False, "error": "Gemini returned unexpected format", "raw": gemini_out[:500]}), 500
264
 
265
- stats = _save_kb_articles(articles, source_label="whatsapp_export")
266
- logger.info("WhatsApp import complete: %s", stats)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
 
268
  return jsonify({
269
- "ok": True,
270
- "articles_found": len(articles),
271
- "saved": stats["saved"],
272
- "skipped_dupes": stats["skipped"],
273
  })
274
 
275
 
@@ -278,7 +622,6 @@ def whatsapp_import():
278
  # ══════════════════════════════════════════════════════════════════════════════
279
 
280
  def _extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
281
- """Extract text from a PDF using pypdf, fallback to Gemini vision."""
282
  if PYPDF_AVAILABLE:
283
  try:
284
  reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))
@@ -288,12 +631,9 @@ def _extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
288
  return text
289
  except Exception as e:
290
  logger.warning("pypdf extraction failed: %s", e)
291
-
292
- # Gemini inline_data fallback for scanned PDFs
293
  if _gemini_client:
294
  try:
295
- b64_pdf = base64.b64encode(pdf_bytes).decode()
296
- resp = _gemini_client.models.generate_content(
297
  model=GEMINI_MODEL,
298
  contents=[
299
  "Extract all text from this PDF document. Return plain text only.",
@@ -306,61 +646,41 @@ def _extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
306
  return ""
307
 
308
 
309
- PDF_KB_PROMPT = """
310
- You are a support knowledge base curator.
311
  Convert the following document content into structured KB articles.
312
- Each article should cover one distinct topic, issue, or procedure.
313
 
314
- Return a STRICT JSON array. Each object:
315
- {
316
- "title": "Short, searchable title",
317
- "content": "Complete explanation in clear language",
318
- "category": "One of: Account, Billing, Technical, Feature, Other",
319
- "tags": ["keyword1", "keyword2"]
320
- }
321
 
322
- Return ONLY the JSON array.
 
323
 
324
  Document content:
325
  """
326
 
327
  @app.post("/api/kb/bulk-upload")
328
  def bulk_upload():
329
- """
330
- Accepts multipart file upload. Supports: .csv, .xlsx, .xls, .pdf
331
- CSV/Excel expected columns: title, content (+ optional: category, tags)
332
- PDF: Gemini extracts and structures articles.
333
- """
334
  if "file" not in request.files:
335
  return jsonify({"ok": False, "error": "No file uploaded"}), 400
336
-
337
  f = request.files["file"]
338
  filename = f.filename or ""
339
  ext = filename.rsplit(".", 1)[-1].lower()
340
  file_data = f.read()
341
-
342
- articles = []
343
 
344
  if ext in ("csv", "xlsx", "xls"):
345
  if not PANDAS_AVAILABLE:
346
  return jsonify({"ok": False, "error": "pandas not installed on server"}), 500
347
  try:
348
- if ext == "csv":
349
- df = pd.read_csv(io.BytesIO(file_data))
350
- else:
351
- df = pd.read_excel(io.BytesIO(file_data))
352
-
353
  df.columns = [c.strip().lower() for c in df.columns]
354
-
355
  if "title" not in df.columns or "content" not in df.columns:
356
  return jsonify({"ok": False, "error": "CSV/Excel must have 'title' and 'content' columns"}), 400
357
-
358
  for _, row in df.iterrows():
359
  tags = []
360
  if "tags" in df.columns and pd.notna(row.get("tags")):
361
- raw_tags = str(row["tags"])
362
- tags = [t.strip() for t in re.split(r"[,;|]", raw_tags) if t.strip()]
363
-
364
  articles.append({
365
  "title": str(row["title"]).strip(),
366
  "content": str(row["content"]).strip(),
@@ -368,100 +688,75 @@ def bulk_upload():
368
  "tags": tags,
369
  })
370
  except Exception as e:
371
- logger.error("Spreadsheet parse error: %s", e)
372
  return jsonify({"ok": False, "error": f"Could not parse file: {e}"}), 400
373
 
374
  elif ext == "pdf":
375
  text = _extract_text_from_pdf_bytes(file_data)
376
  if not text:
377
  return jsonify({"ok": False, "error": "Could not extract text from PDF"}), 400
378
-
379
- gemini_out = _gemini_text(PDF_KB_PROMPT + text[:50000], json_mode=True)
380
- articles = _safe_json(gemini_out, [])
381
-
382
- if not isinstance(articles, list):
383
- return jsonify({"ok": False, "error": "Gemini PDF structuring failed"}), 500
384
  else:
385
- return jsonify({"ok": False, "error": f"Unsupported file type: .{ext}. Use csv, xlsx, or pdf"}), 400
386
 
387
  if not articles:
388
  return jsonify({"ok": False, "error": "No articles extracted from file"}), 400
389
 
390
  stats = _save_kb_articles(articles, source_label=f"bulk_upload:{filename}")
391
- return jsonify({
392
- "ok": True,
393
- "articles_found": len(articles),
394
- "saved": stats["saved"],
395
- "skipped_dupes": stats["skipped"],
396
- })
397
 
398
 
399
  # ══════════════════════════════════════════════════════════════════════════════
400
  # FEATURE 3 — Ticket Submission via NL Text or Voice
401
  # ══════════════════════════════════════════════════════════════════════════════
402
 
403
- TICKET_EXTRACTION_PROMPT = """
404
- You are a support ticket intake system for a software support portal.
405
 
406
  A user has described their issue in natural language. Extract structured ticket fields.
407
 
408
- Return STRICT JSON (no other text):
409
- {
410
- "title": "Concise ticket title (max 80 chars)",
411
- "description": "Full detailed description of the issue, rewritten clearly in third person",
412
- "category_hint": "Best matching category: Account | Billing | Technical | Feature | Other",
413
- "priority_hint": "One of: low | medium | high | critical (based on urgency language)",
414
- "keywords": ["array", "of", "technical", "keywords"]
415
- }
416
 
417
- User's message:
 
 
 
418
  """
419
 
420
  def _transcribe_audio_assemblyai(audio_b64: str, audio_format: str = "wav") -> str:
421
- """Upload audio to AssemblyAI and poll for transcript."""
422
  if not ASSEMBLYAI_API_KEY:
423
  return ""
424
-
425
  audio_bytes = base64.b64decode(audio_b64)
426
  headers = {"authorization": ASSEMBLYAI_API_KEY}
427
-
428
- # 1. Upload
429
  try:
430
  upload_resp = requests.post(
431
  f"{ASSEMBLYAI_BASE}/upload",
432
  headers={**headers, "Content-Type": "application/octet-stream"},
433
- data=audio_bytes,
434
- timeout=30
435
  )
436
  upload_resp.raise_for_status()
437
  upload_url = upload_resp.json().get("upload_url")
438
  except Exception as e:
439
  logger.error("AssemblyAI upload error: %s", e)
440
  return ""
441
-
442
- # 2. Request transcript
443
  try:
444
  tx_resp = requests.post(
445
  f"{ASSEMBLYAI_BASE}/transcript",
446
  headers={**headers, "Content-Type": "application/json"},
447
- json={"audio_url": upload_url, "language_detection": True},
448
- timeout=15
449
  )
450
  tx_resp.raise_for_status()
451
  tx_id = tx_resp.json().get("id")
452
  except Exception as e:
453
  logger.error("AssemblyAI transcript request error: %s", e)
454
  return ""
455
-
456
- # 3. Poll
457
  for _ in range(30):
458
  time.sleep(3)
459
  try:
460
- poll = requests.get(
461
- f"{ASSEMBLYAI_BASE}/transcript/{tx_id}",
462
- headers=headers,
463
- timeout=15
464
- )
465
  poll.raise_for_status()
466
  result = poll.json()
467
  status = result.get("status")
@@ -477,74 +772,47 @@ def _transcribe_audio_assemblyai(audio_b64: str, audio_format: str = "wav") -> s
477
 
478
  @app.post("/api/tickets/submit-nl")
479
  def submit_ticket_nl():
480
- """
481
- POST body: { "message": "I can't log in, it says my account is locked...", "user_id": "..." }
482
- Returns structured ticket fields for the frontend to pre-fill and submit.
483
- """
484
  body = request.get_json(silent=True) or {}
485
  message = body.get("message", "").strip()
486
  user_id = body.get("user_id", "anonymous")
487
-
488
  if not message:
489
  return jsonify({"ok": False, "error": "message is required"}), 400
490
-
491
- gemini_out = _gemini_text(TICKET_EXTRACTION_PROMPT + message, json_mode=True)
492
- ticket = _safe_json(gemini_out, {})
493
-
494
- if not ticket.get("title"):
495
  return jsonify({"ok": False, "error": "Could not extract ticket info from message"}), 500
496
-
497
- # Log submission attempt
498
  if db:
499
  db.collection("iris_ai_ticket_drafts").add({
500
- "user_id": user_id,
501
- "raw_input": message,
502
- "extracted": ticket,
503
- "channel": "nl_text",
504
  "created_at": datetime.now(timezone.utc).isoformat(),
505
  })
506
-
507
  return jsonify({"ok": True, "ticket": ticket})
508
 
509
 
510
  @app.post("/api/tickets/submit-voice")
511
  def submit_ticket_voice():
512
- """
513
- POST body: { "audio_b64": "<base64 audio>", "audio_format": "wav", "user_id": "..." }
514
- Transcribes audio via AssemblyAI, then extracts ticket via Gemini.
515
- """
516
  body = request.get_json(silent=True) or {}
517
  audio_b64 = body.get("audio_b64", "")
518
  audio_format = body.get("audio_format", "wav")
519
  user_id = body.get("user_id", "anonymous")
520
-
521
  if not audio_b64:
522
  return jsonify({"ok": False, "error": "audio_b64 is required"}), 400
523
-
524
  if not ASSEMBLYAI_API_KEY:
525
  return jsonify({"ok": False, "error": "AssemblyAI not configured on server"}), 500
526
-
527
- logger.info("Voice ticket: transcribing audio for user=%s", user_id)
528
  transcript = _transcribe_audio_assemblyai(audio_b64, audio_format)
529
-
530
  if not transcript:
531
  return jsonify({"ok": False, "error": "Transcription failed or returned empty result"}), 500
532
-
533
- gemini_out = _gemini_text(TICKET_EXTRACTION_PROMPT + transcript, json_mode=True)
534
- ticket = _safe_json(gemini_out, {})
535
-
536
- if not ticket.get("title"):
537
  return jsonify({"ok": False, "error": "Could not extract ticket info from transcript"}), 500
538
-
539
  if db:
540
  db.collection("iris_ai_ticket_drafts").add({
541
- "user_id": user_id,
542
- "raw_input": transcript,
543
- "extracted": ticket,
544
- "channel": "voice",
545
  "created_at": datetime.now(timezone.utc).isoformat(),
546
  })
547
-
548
  return jsonify({"ok": True, "transcript": transcript, "ticket": ticket})
549
 
550
 
@@ -552,32 +820,22 @@ def submit_ticket_voice():
552
  # FEATURE 4 — System Tutorial Ingestion
553
  # ══════════════════════════════════════════════════════════════════════════════
554
 
555
- TUTORIAL_EXTRACTION_PROMPT = """
556
- You are a knowledge base curator for a software support system.
557
 
558
- You have been given a timestamped transcript from a video tutorial about the Iris Support Portal.
559
- Your job is to extract discrete how-to articles, one per distinct feature or task demonstrated.
560
 
561
- For each article, identify the best timestamp range where the solution or demonstration occurs.
 
562
 
563
- Return a STRICT JSON array. Each object:
564
- {
565
- "title": "How to <do something> in Iris",
566
- "content": "Step-by-step instructions based on the tutorial",
567
- "category": "One of: Account | Tickets | Agents | Reports | Admin | Other",
568
- "tags": ["keyword1", "keyword2"],
569
- "timestamp_start": <seconds as integer>,
570
- "timestamp_end": <seconds as integer>
571
- }
572
 
573
- Return ONLY the JSON array.
574
-
575
- Transcript (with timestamps in [MM:SS] or [HH:MM:SS] format):
576
  """
577
 
578
  def _parse_timestamp_to_seconds(ts: str) -> int:
579
- """Convert MM:SS or HH:MM:SS string to integer seconds."""
580
- parts = ts.strip("[]").split(":")
581
  try:
582
  if len(parts) == 2:
583
  return int(parts[0]) * 60 + int(parts[1])
@@ -587,169 +845,101 @@ def _parse_timestamp_to_seconds(ts: str) -> int:
587
  pass
588
  return 0
589
 
590
-
591
  @app.post("/api/kb/tutorial-ingest")
592
  def tutorial_ingest():
593
- """
594
- POST body: {
595
- "transcript": "<timestamped transcript text>",
596
- "video_url": "https://...", (optional, for linking crop timestamps)
597
- "video_title": "Getting Started with Iris"
598
- }
599
- Gemini extracts how-to articles with timestamp ranges.
600
- """
601
  body = request.get_json(silent=True) or {}
602
  transcript = body.get("transcript", "").strip()
603
  video_url = body.get("video_url", "")
604
  video_title = body.get("video_title", "Tutorial")
605
-
606
  if not transcript:
607
  return jsonify({"ok": False, "error": "transcript is required"}), 400
608
-
609
- logger.info("Tutorial ingest: %d chars, title=%s", len(transcript), video_title)
610
-
611
- gemini_out = _gemini_text(TUTORIAL_EXTRACTION_PROMPT + transcript[:50000], json_mode=True)
612
- articles = _safe_json(gemini_out, [])
613
-
614
- if not isinstance(articles, list):
615
- return jsonify({"ok": False, "error": "Gemini returned unexpected format"}), 500
616
-
617
- # Inject video metadata into each article
618
  for a in articles:
619
  a["video_url"] = video_url
620
  a["video_title"] = video_title
621
- # Ensure numeric seconds (Gemini may return the parsed value; validate it)
622
  for ts_key in ("timestamp_start", "timestamp_end"):
623
  val = a.get(ts_key)
624
  if isinstance(val, str):
625
  a[ts_key] = _parse_timestamp_to_seconds(val)
626
  elif not isinstance(val, int):
627
  a[ts_key] = 0
628
-
629
  stats = _save_kb_articles(articles, source_label=f"tutorial:{video_title}")
630
-
631
- return jsonify({
632
- "ok": True,
633
- "video_title": video_title,
634
- "articles_found": len(articles),
635
- "saved": stats["saved"],
636
- "skipped_dupes": stats["skipped"],
637
- })
638
 
639
 
640
  # ══════════════════════════════════════════════════════════════════════════════
641
  # FEATURE 5 — Agent Solution Writing (NL Text + Voice)
642
  # ══════════════════════════════════════════════════════════════════════════════
643
 
644
- SOLUTION_EXTRACTION_PROMPT = """
645
- You are a support knowledge base curator.
646
-
647
- An agent or support staff has described a solution they discovered while resolving a ticket.
648
  Structure this into a reusable KB article.
649
 
650
- Return STRICT JSON:
651
- {
652
- "title": "Short, searchable problem title",
653
- "content": "Clear step-by-step solution, rewritten for future reference",
654
- "category": "One of: Account | Billing | Technical | Feature | Other",
655
- "tags": ["relevant", "keywords"]
656
- }
657
 
658
- Agent's description:
659
  """
660
 
661
  @app.post("/api/kb/agent-solution-nl")
662
  def agent_solution_nl():
663
- """
664
- POST body: { "message": "I fixed ticket #123 by...", "agent_id": "...", "ticket_id": "..." }
665
- Creates a KB article from agent's natural language solution description.
666
- """
667
  body = request.get_json(silent=True) or {}
668
  message = body.get("message", "").strip()
669
  agent_id = body.get("agent_id", "unknown")
670
  ticket_id = body.get("ticket_id", "")
671
-
672
  if not message:
673
  return jsonify({"ok": False, "error": "message is required"}), 400
674
-
675
- gemini_out = _gemini_text(SOLUTION_EXTRACTION_PROMPT + message, json_mode=True)
676
- article = _safe_json(gemini_out, {})
677
-
678
- if not article.get("title"):
679
  return jsonify({"ok": False, "error": "Could not structure solution"}), 500
680
-
681
- # Add ticket reference tag
682
  if ticket_id:
683
  article.setdefault("tags", []).append(f"ticket:{ticket_id}")
684
-
685
  stats = _save_kb_articles([article], source_label=f"agent:{agent_id}")
686
-
687
- return jsonify({
688
- "ok": True,
689
- "saved": stats["saved"],
690
- "article": article,
691
- })
692
 
693
 
694
  @app.post("/api/kb/agent-solution-voice")
695
  def agent_solution_voice():
696
- """
697
- POST body: { "audio_b64": "...", "audio_format": "wav", "agent_id": "...", "ticket_id": "..." }
698
- Transcribes agent's voice note, structures into KB article.
699
- """
700
  body = request.get_json(silent=True) or {}
701
  audio_b64 = body.get("audio_b64", "")
702
  audio_format = body.get("audio_format", "wav")
703
  agent_id = body.get("agent_id", "unknown")
704
  ticket_id = body.get("ticket_id", "")
705
-
706
  if not audio_b64:
707
  return jsonify({"ok": False, "error": "audio_b64 is required"}), 400
708
-
709
  transcript = _transcribe_audio_assemblyai(audio_b64, audio_format)
710
-
711
  if not transcript:
712
  return jsonify({"ok": False, "error": "Transcription failed"}), 500
713
-
714
- gemini_out = _gemini_text(SOLUTION_EXTRACTION_PROMPT + transcript, json_mode=True)
715
- article = _safe_json(gemini_out, {})
716
-
717
- if not article.get("title"):
718
  return jsonify({"ok": False, "error": "Could not structure solution from transcript"}), 500
719
-
720
  if ticket_id:
721
  article.setdefault("tags", []).append(f"ticket:{ticket_id}")
722
-
723
  stats = _save_kb_articles([article], source_label=f"agent:{agent_id}")
724
-
725
- return jsonify({
726
- "ok": True,
727
- "transcript": transcript,
728
- "saved": stats["saved"],
729
- "article": article,
730
- })
731
 
732
 
733
  # ══════════════════════════════════════════════════════════════════════════════
734
- # FEATURE 6 — Iris Support Chatbot (RAG over KB + Tutorials)
735
  # ══════════════════════════════════════════════════════════════════════════════
736
 
737
  def _search_kb(query: str, limit: int = 5) -> List[Dict]:
738
- """
739
- Simple keyword search over Firestore KB articles.
740
- Production upgrade: swap with a vector DB (e.g. Qdrant) or Vertex AI Search.
741
- """
742
  if not db:
743
  return []
744
-
745
  query_terms = [t.lower() for t in query.split() if len(t) > 2]
746
-
747
  try:
748
- # Fetch recent articles (Firestore doesn't support full-text, this is a lightweight approach)
749
  docs = db.collection("iris_kb_articles").order_by(
750
  "created_at", direction=firestore.Query.DESCENDING
751
  ).limit(200).stream()
752
-
753
  results = []
754
  for doc in docs:
755
  d = doc.to_dict()
@@ -757,60 +947,40 @@ def _search_kb(query: str, limit: int = 5) -> List[Dict]:
757
  score = sum(1 for term in query_terms if term in text)
758
  if score > 0:
759
  results.append({"score": score, **d})
760
-
761
  results.sort(key=lambda x: x["score"], reverse=True)
762
  return results[:limit]
763
-
764
  except Exception as e:
765
  logger.error("KB search error: %s", e)
766
  return []
767
 
768
 
769
- CHATBOT_SYSTEM_PROMPT = """
770
- You are Iris, an intelligent support assistant for the Iris Support Portal.
771
 
772
- Your role: Help users resolve issues quickly using the knowledge base and tutorial content provided.
773
-
774
- Rules:
775
- - Answer ONLY from the provided context. Do not hallucinate solutions.
776
- - If the answer is in a tutorial with a timestamp, mention the video and timestamp so the user can jump to that moment.
777
- - Be concise, clear, and friendly.
778
- - If you cannot find the answer, say so honestly and suggest submitting a ticket.
779
- - Format step-by-step answers as numbered lists.
780
  """
781
 
782
  @app.post("/api/chatbot/query")
783
  def chatbot_query():
784
- """
785
- POST body: {
786
- "message": "How do I reset a user's password?",
787
- "session_id": "...",
788
- "user_id": "..."
789
- }
790
- RAG: searches KB, then uses Gemini to synthesize an answer.
791
- """
792
  body = request.get_json(silent=True) or {}
793
  message = body.get("message", "").strip()
794
  session_id = body.get("session_id", "default")
795
  user_id = body.get("user_id", "anonymous")
796
-
797
  if not message:
798
  return jsonify({"ok": False, "error": "message is required"}), 400
799
-
800
- # Retrieve relevant KB context
801
  kb_results = _search_kb(message, limit=5)
802
-
803
  context_blocks = []
804
  sources = []
805
  for r in kb_results:
806
  block = f"[Article: {r.get('title')}]\n{r.get('content', '')}"
807
  if r.get("timestamp_start") is not None:
808
- ts = r["timestamp_start"]
809
- mm = ts // 60
810
- ss = ts % 60
811
- url = r.get("video_url", "")
812
- block += f"\n(Tutorial: {r.get('video_title','Video')} at {mm:02d}:{ss:02d}"
813
- block += f" — {url})" if url else ")"
814
  context_blocks.append(block)
815
  sources.append({
816
  "title": r.get("title"),
@@ -819,75 +989,43 @@ def chatbot_query():
819
  "ts_start": r.get("timestamp_start"),
820
  "video_url": r.get("video_url"),
821
  })
822
-
823
- context_str = "\n\n---\n\n".join(context_blocks) if context_blocks else "No relevant articles found."
824
-
825
- full_prompt = f"""{CHATBOT_SYSTEM_PROMPT}
826
-
827
- KNOWLEDGE BASE CONTEXT:
828
- {context_str}
829
-
830
- USER QUESTION: {message}
831
-
832
- Answer:"""
833
-
834
- answer = _gemini_text(full_prompt)
835
-
836
  if not answer:
837
- answer = "I'm sorry, I couldn't process your question right now. Please try again or submit a support ticket."
838
-
839
- # Persist chat log
840
  if db:
841
  db.collection("iris_chatbot_logs").add({
842
- "user_id": user_id,
843
- "session_id": session_id,
844
- "message": message,
845
- "answer": answer,
846
- "sources": sources,
847
  "created_at": datetime.now(timezone.utc).isoformat(),
848
  })
849
-
850
- return jsonify({
851
- "ok": True,
852
- "answer": answer,
853
- "sources": sources,
854
- })
855
 
856
 
857
  # ══════════════════════════════════════════════════════════════════════════════
858
- # KB READ ENDPOINTS (for frontend display)
859
  # ══════════════════════════════════════════════════════════════════════════════
860
 
861
  @app.get("/api/kb/articles")
862
  def list_kb_articles():
863
- """
864
- GET /api/kb/articles?category=Technical&limit=50
865
- Lists KB articles, optionally filtered by category.
866
- """
867
  category = request.args.get("category", "")
868
  limit = int(request.args.get("limit", 50))
869
-
870
  if not db:
871
  return jsonify({"ok": False, "error": "Firebase unavailable"}), 500
872
-
873
  try:
874
- query = db.collection("iris_kb_articles").order_by(
875
- "created_at", direction=firestore.Query.DESCENDING
876
- )
877
  if category:
878
  query = query.where("category", "==", category)
879
-
880
  docs = query.limit(limit).stream()
881
  articles = [{"id": d.id, **d.to_dict()} for d in docs]
882
  return jsonify({"ok": True, "articles": articles, "count": len(articles)})
883
  except Exception as e:
884
- logger.error("KB list error: %s", e)
885
  return jsonify({"ok": False, "error": str(e)}), 500
886
 
887
 
888
  @app.delete("/api/kb/articles/<article_id>")
889
  def delete_kb_article(article_id: str):
890
- """DELETE /api/kb/articles/<id> — Admin only (JWT check to be enforced at gateway)"""
891
  if not db:
892
  return jsonify({"ok": False, "error": "Firebase unavailable"}), 500
893
  try:
@@ -910,14 +1048,14 @@ def health():
910
  article_count = docs[0][0].value
911
  except Exception:
912
  pass
913
-
914
  return jsonify({
915
- "ok": True,
916
- "service": "Iris AI Service v1.0",
917
- "gemini": bool(_gemini_client),
918
- "assemblyai": bool(ASSEMBLYAI_API_KEY),
919
- "firebase": bool(db),
920
- "kb_articles": article_count,
 
921
  })
922
 
923
 
@@ -927,5 +1065,5 @@ def health():
927
 
928
  if __name__ == "__main__":
929
  port = int(os.environ.get("PORT", 7860))
930
- logger.info("Iris AI Service starting on port %d", port)
931
  app.run(host="0.0.0.0", port=port)
 
1
  """
2
+ main.py — Iris AI Service (v1.1 - April 2026)
3
 
4
  AI layer for the Iris Support Portal (IrisPlus / Unified Spark Desk).
5
  Deployed as a HuggingFace Space monofile (Flask + Gemini + AssemblyAI + Firebase).
6
 
7
+ CHANGELOG v1.1:
8
+ - Model: gemini-3.1-flash-lite-preview (multimodal reasoning)
9
+ - /api/kb/whatsapp-import: now accepts multipart ZIP upload
10
+ * Extracts _chat.txt + maps image files to <Media omitted> pointers
11
+ * Sliding-window chunking (~10k tokens / ~40k chars with overlap)
12
+ * Multimodal: sends images inline with their surrounding text chunk
13
+ * Strict JSON enforcement + pre-save validation
14
+ * JSON parse error recovery (regex extraction fallback)
15
+ - All other endpoints unchanged from v1.0
16
+
17
  FEATURES:
18
+ 1. WhatsApp Export → Knowledge Base (ZIP multimodal, chunked, additive)
19
  2. Bulk KB Upload (CSV / Excel / PDF)
20
+ 3. Natural Language + Voice Ticket Submission
21
+ 4. System Tutorial Ingestion (timestamped transcripts)
22
+ 5. Agent NL/Voice Solution Writing
23
+ 6. Iris Chatbot (KB RAG)
24
 
25
  ENV VARS:
26
  GOOGLE_API_KEY — Gemini API key
27
  ASSEMBLYAI_API_KEY — AssemblyAI API key
28
  FIREBASE — JSON string of Firebase service account
29
+ GEMINI_MODEL — Override model (default: gemini-3.1-flash-lite-preview)
30
  PORT — Server port (default 7860)
31
  """
32
 
 
38
  import logging
39
  import base64
40
  import hashlib
41
+ import zipfile
42
+ import tempfile
43
  from datetime import datetime, timezone
44
+ from typing import Any, Dict, List, Optional, Tuple
45
 
46
  import requests
47
  from flask import Flask, request, jsonify
 
65
  logger.error("google-genai not installed: %s", e)
66
 
67
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
68
+ # v1.1: upgraded to gemini-3.1-flash-lite-preview for multimodal reasoning
69
+ GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-3.1-flash-lite-preview")
70
 
71
  _gemini_client = None
72
  if genai and GOOGLE_API_KEY:
 
113
 
114
  db = init_firestore()
115
 
116
+ # ─── Optional libs ────────────────────────────────────────────────────────────
117
 
118
  try:
119
  import pandas as pd
 
133
  CORS(app)
134
 
135
  # ══════════════════════════════════════════════════════════════════════════════
136
+ # SHARED HELPERS
137
  # ══════════════════════════════════════════════════════════════════════════════
138
 
139
+ # Supported image extensions for multimodal WhatsApp ingestion
140
+ SUPPORTED_IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}
141
+
142
+ # Approx chars per token (conservative for mixed Shona/English/emoji content)
143
+ CHARS_PER_TOKEN = 4
144
+ # Target ~10k tokens per chunk with ~1k token overlap
145
+ CHUNK_CHARS = 40_000
146
+ OVERLAP_CHARS = 4_000
147
+
148
+
149
def _safe_json(text: str, fallback: Any) -> Any:
    """
    Parse JSON out of a model response, tolerating common wrappers.

    Strategies, tried in order until one yields valid JSON:
      1. Parse the stripped text as-is.
      2. Parse the body of the first Markdown code fence (``` or ```json),
         including a fence that was opened but never closed.
      3. Parse the outermost [...] / {...} span of the fence body, then of
         the whole text, starting with whichever bracket opens first.

    Args:
        text: Raw model output (may be empty).
        fallback: Value returned when no strategy succeeds.

    Returns:
        The decoded JSON value, or ``fallback``.
    """
    if not text:
        return fallback

    stripped = text.strip()

    # The untouched text goes first: valid JSON whose string values happen
    # to contain ``` must not be cut apart by the fence handling below.
    candidates = [stripped]

    fence = re.search(r"```(?:json)?\s*([\s\S]*?)(?:```|$)", stripped, re.IGNORECASE)
    if fence:
        fenced_body = fence.group(1).strip()
        if fenced_body:
            candidates.append(fenced_body)

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            pass

    # Bracket recovery: prefer the fence body (less surrounding prose).
    for candidate in reversed(candidates):
        spans = []
        for opener, closer in (("[", "]"), ("{", "}")):
            start = candidate.find(opener)
            end = candidate.rfind(closer)
            if start != -1 and end > start:
                spans.append((start, candidate[start:end + 1]))
        # Earliest opener first, so {"tags": [...]} is read as the object
        # rather than as the list nested inside it.
        for _, span in sorted(spans):
            try:
                return json.loads(span)
            except json.JSONDecodeError:
                pass

    logger.error("JSON parse exhausted all strategies. First 300 chars: %s", text[:300])
    return fallback
194
+
195
+
196
def _validate_articles(data: Any) -> List[Dict]:
    """
    Filter model output down to well-formed KB article dicts.

    Malformed items are dropped individually so one bad entry does not
    fail the whole batch.

    Args:
        data: Decoded JSON from the model; expected to be a list of dicts.

    Returns:
        A list of dicts with "title", "content", "category" and "tags".
        "timestamp_start" / "timestamp_end" are carried over unchanged when
        the source item provides them. Returns [] if ``data`` is not a list.
    """
    if not isinstance(data, list):
        logger.warning("Expected list from Gemini, got %s", type(data))
        return []

    def _clean(value: Any) -> str:
        # A JSON null must become "", not the four-letter string "None".
        return "" if value is None else str(value).strip()

    valid = []
    for item in data:
        if not isinstance(item, dict):
            continue
        title = _clean(item.get("title"))
        content = _clean(item.get("content"))
        # Minimum lengths reject placeholder titles and empty bodies.
        if len(title) < 3 or len(content) < 10:
            continue
        tags = item.get("tags")
        article = {
            "title": title,
            "content": content,
            "category": _clean(item.get("category")) or "General",
            "tags": tags if isinstance(tags, list) else [],
        }
        # Tutorial extraction emits per-article timestamps; dropping them
        # here would leave every saved tutorial article without them.
        for key in ("timestamp_start", "timestamp_end"):
            if item.get(key) is not None:
                article[key] = item[key]
        valid.append(article)
    return valid
219
 
220
 
221
  def _gemini_text(prompt: str, json_mode: bool = False) -> str:
222
+ """Call Gemini with text-only content."""
223
  if not _gemini_client:
224
  return ""
225
+ cfg = genai_types.GenerateContentConfig(
226
+ response_mime_type="application/json"
227
+ ) if json_mode else None
228
  try:
229
  resp = _gemini_client.models.generate_content(
230
  model=GEMINI_MODEL,
 
233
  )
234
  return resp.text or ""
235
  except Exception as e:
236
+ logger.error("Gemini text call error: %s", e)
237
+ return ""
238
+
239
+
240
def _gemini_multimodal(parts: list, json_mode: bool = False) -> str:
    """
    Send mixed text/image content to Gemini and return the response text.

    Args:
        parts: Content list passed straight through as ``contents``
            (text strings and image Parts).
        json_mode: When True, request an ``application/json`` response.

    Returns:
        The response text, or "" when no client is configured, the call
        raises, or the response carries no text.
    """
    if not _gemini_client:
        return ""

    if json_mode:
        request_config = genai_types.GenerateContentConfig(
            response_mime_type="application/json"
        )
    else:
        request_config = None

    try:
        response = _gemini_client.models.generate_content(
            model=GEMINI_MODEL,
            contents=parts,
            config=request_config,
        )
        # Reading .text stays inside the try so a failure there is also
        # logged and mapped to "".
        return response.text or ""
    except Exception as exc:
        logger.error("Gemini multimodal call error: %s", exc)
        return ""
257
 
258
 
259
def _article_fingerprint(title: str, content: str) -> str:
    """
    Return a 16-hex-char dedup key for an article.

    The key is the SHA-256 of the lower-cased, stripped title joined by
    "::" to the first 300 characters of the lower-cased, stripped content.
    """
    normalized_title = title.strip().lower()
    content_prefix = content.strip().lower()[:300]
    digest = hashlib.sha256("::".join((normalized_title, content_prefix)).encode())
    return digest.hexdigest()[:16]
262
 
263
 
264
  def _get_existing_fingerprints() -> set:
 
265
  if not db:
266
  return set()
267
  try:
 
273
 
274
 
275
  def _save_kb_articles(articles: List[Dict], source_label: str) -> Dict:
 
276
  if not db:
277
  return {"saved": 0, "skipped": 0, "error": "Firebase unavailable"}
 
278
  existing = _get_existing_fingerprints()
279
  saved, skipped = 0, 0
 
280
  for article in articles:
281
  title = article.get("title", "Untitled")
282
  content = article.get("content", "")
283
  fp = _article_fingerprint(title, content)
 
284
  if fp in existing:
285
  skipped += 1
286
  continue
 
287
  doc = {
288
  "title": title,
289
  "content": content,
 
293
  "fingerprint": fp,
294
  "created_at": datetime.now(timezone.utc).isoformat(),
295
  }
 
296
  if article.get("timestamp_start") is not None:
297
  doc["timestamp_start"] = article["timestamp_start"]
298
  doc["timestamp_end"] = article.get("timestamp_end")
299
  doc["video_url"] = article.get("video_url", "")
 
300
  db.collection("iris_kb_articles").add(doc)
301
  existing.add(fp)
302
  saved += 1
 
303
  return {"saved": saved, "skipped": skipped}
304
 
305
 
306
  # ══════════════════════════════════════════════════════════════════════════════
307
+ # WHATSAPP ZIP PROCESSOR
308
+ # ══════════════════════════════════════════════════════════════════════════════
309
+
310
# Regex to match WhatsApp timestamp lines
# Handles both:  DD/MM/YYYY, HH:MM - Sender: message
#           and:  DD/MM/YYYY, HH:MM am/pm - Sender: message
WA_LINE_RE = re.compile(
    r'^\d{1,2}/\d{1,2}/\d{4},\s+\d{1,2}:\d{2}(?:\s*[ap]m)?\s+-\s+',
    re.IGNORECASE
)

# Matches "<Media omitted>" or a media filename token anywhere in a line
# ("IMG-...jpg (file attached)", "<attached: ...jpg>", "[file.jpg]").
# Group 1 is the bare filename and is None for "<Media omitted>".
# The filename token excludes whitespace, brackets, ':' and path separators
# so the date/sender prefix of the line is never swallowed into group 1;
# it also excludes the invisible direction marks (U+200E etc.) that can
# sit directly in front of the filename.
MEDIA_POINTER_RE = re.compile(
    r'<Media omitted>'
    r'|([^\s\[\]<>:/\\\u200e\u200f\u202a-\u202e]+'
    r'\.(?:jpg|jpeg|png|webp|gif|mp4|opus|aac|m4a))\b',
    re.IGNORECASE
)
323
+
324
+
325
class WhatsAppZipProcessor:
    """
    Handles extraction and multimodal chunking of a WhatsApp .zip export.

    A WhatsApp export zip typically contains:
      _chat.txt            — the full conversation
      IMG-YYYYMMDD-*.jpg   — attached images
      VID-*.mp4            — videos (we skip these, too large)
      PTT-*.opus           — voice notes (skipped)

    Usage: call extract() to populate ``chat_text`` and ``media_map``, then
    build_chunks() to obtain the text windows with their images.
    """

    # Upper bound on images attached to a single chunk.
    MAX_IMAGES_PER_CHUNK = 5

    # One image filename token inside a chat line. Whitespace, brackets, ':'
    # and path separators are excluded so the date/sender prefix is never
    # part of the token; invisible direction marks (U+200E etc.) that can
    # precede the filename are excluded too.
    _IMAGE_NAME_RE = re.compile(
        r'[^\s\[\]<>:/\\\u200e\u200f\u202a-\u202e]+\.(?:jpe?g|png|webp|gif)\b',
        re.IGNORECASE,
    )

    def __init__(self, zip_bytes: bytes):
        self.zip_bytes = zip_bytes
        self.chat_text = ""
        self.media_map: Dict[str, bytes] = {}  # filename -> raw bytes

    @staticmethod
    def _is_metadata_entry(name: str) -> bool:
        """True for directory entries and macOS archive metadata (__MACOSX/, ._*)."""
        return (
            name.endswith("/")
            or name.startswith("__MACOSX/")
            or os.path.basename(name).startswith("._")
        )

    def extract(self) -> bool:
        """Extract chat text and image files from ZIP. Returns True on success."""
        try:
            with zipfile.ZipFile(io.BytesIO(self.zip_bytes)) as zf:
                all_names = zf.namelist()
                logger.info("ZIP contains %d files: %s", len(all_names), all_names[:20])

                # Metadata entries such as "__MACOSX/._chat.txt" would
                # otherwise satisfy the chat-file test below.
                names = [n for n in all_names if not self._is_metadata_entry(n)]

                # Find chat file — WhatsApp names it _chat.txt or WhatsApp Chat with *.txt
                chat_file = None
                for name in names:
                    base = os.path.basename(name).lower()
                    if base == "_chat.txt" or (base.endswith(".txt") and "chat" in base):
                        chat_file = name
                        break
                if not chat_file:
                    # Fallback: any .txt file
                    txts = [n for n in names if n.lower().endswith(".txt")]
                    if txts:
                        chat_file = txts[0]

                if not chat_file:
                    logger.error("No chat .txt found in ZIP")
                    return False

                raw = zf.read(chat_file)
                # utf-8-sig drops a leading BOM if the export has one.
                self.chat_text = raw.decode("utf-8-sig", errors="replace")
                logger.info("Chat text extracted: %d chars from %s", len(self.chat_text), chat_file)

                # Extract images (skip videos and audio — too large / not useful for KB)
                for name in names:
                    ext = os.path.splitext(name.lower())[1]
                    if ext in SUPPORTED_IMAGE_EXTS:
                        try:
                            self.media_map[os.path.basename(name)] = zf.read(name)
                        except Exception as e:
                            logger.warning("Could not read media file %s: %s", name, e)

                logger.info("Media files extracted: %d images", len(self.media_map))
                return True

        except zipfile.BadZipFile as e:
            logger.error("Bad ZIP file: %s", e)
            return False
        except Exception as e:
            logger.error("ZIP extraction error: %s", e)
            return False

    def _resolve_media_in_line(self, line: str) -> Optional[bytes]:
        """
        Return the bytes of the first image named in ``line`` that exists
        in ``media_map``, else None.

        "<Media omitted>" lines never resolve: the file was not exported.
        """
        if not self.media_map:
            return None
        for token in self._IMAGE_NAME_RE.findall(line):
            img = self.media_map.get(token)
            if img is not None:
                return img
        return None

    def build_chunks(self) -> List[Dict]:
        """
        Split chat text into overlapping chunks, each annotated with
        the image bytes found within that chunk.

        A chunk is closed once it reaches CHUNK_CHARS characters or the
        last line is consumed. The next chunk starts with the trailing
        whole lines of the previous one, up to OVERLAP_CHARS characters.

        Returns list of:
          { "text": str, "images": [bytes, ...], "line_range": (start, end) }
        """
        lines = self.chat_text.splitlines()
        total = len(lines)
        chunks: List[Dict] = []

        chunk_lines: List[str] = []
        chunk_images: List[bytes] = []
        char_count = 0

        for i, line in enumerate(lines):
            chunk_lines.append(line)
            char_count += len(line) + 1  # +1 for newline

            img_bytes = self._resolve_media_in_line(line)
            if img_bytes and len(chunk_images) < self.MAX_IMAGES_PER_CHUNK:
                chunk_images.append(img_bytes)

            is_last = i == total - 1
            if char_count < CHUNK_CHARS and not is_last:
                continue

            chunks.append({
                "text": "\n".join(chunk_lines),
                "images": chunk_images[:],
                "line_range": (i - len(chunk_lines) + 1, i)
            })
            logger.info(
                "Chunk %d: %d lines, %d chars, %d images",
                len(chunks), len(chunk_lines), char_count, len(chunk_images)
            )
            if is_last:
                break

            # Carry trailing lines into the next chunk, never more than
            # OVERLAP_CHARS in total. Keeping the carry-over strictly below
            # the chunk size guarantees each new chunk can absorb new lines;
            # an oversized final line therefore yields no overlap at all.
            keep_from = len(chunk_lines)
            overlap_chars = 0
            while keep_from > 0:
                cost = len(chunk_lines[keep_from - 1]) + 1
                if overlap_chars + cost > OVERLAP_CHARS:
                    break
                overlap_chars += cost
                keep_from -= 1
            chunk_lines = chunk_lines[keep_from:]
            chunk_images = []
            char_count = overlap_chars

        logger.info("Total chunks: %d", len(chunks))
        return chunks
454
+
455
+
456
+ # ══════════════════════════════════════════════════════════════════════════════
457
+ # WHATSAPP EXTRACTION PROMPT
458
  # ══════════════════════════════════════════════════════════════════════════════
459
 
460
+ WHATSAPP_EXTRACTION_PROMPT = """You are a support knowledge base curator for the Iris field staff management app used in Zimbabwe.
461
+
462
+ Your task: analyse this WhatsApp support group chat segment and extract ONLY clear problem→solution pairs.
463
 
464
+ CONTEXT ABOUT THIS APP:
465
+ - "Iris" is a mobile attendance/location tracking app used by field sales reps at retail stores.
466
+ - Common issues: GPS location not detected, clock-in failures, app killed by Android battery optimiser,
467
+ teller passkey problems, hours not recording correctly, store radius too small, wrong teller name shown.
468
+ - Messages mix English, Shona, and Ndebele. Understand regional vernacular (e.g. "irikudzima" = switching off,
469
+ "ndakashanda" = I worked, "short yemahours" = hours shortage, "gadzirisayi" = fix it through).
470
+ - If screenshots show Android error dialogs (e.g. "Service killed by system", "App stopped"), reason through
471
+ what that means for Android background restriction and include it in the solution.
472
 
473
+ STRICT RULES:
474
+ 1. Extract ONLY exchanges where a user described a problem AND a named support person (Tendayi, Tony, Violet,
475
+ Rufaro, Albrighton, Ishmael, or any named responder) provided a working solution or clear instruction.
476
+ 2. Ignore: greetings, media-only messages, deleted messages, clock-in screenshots with no text context,
477
+ messages from unknown numbers with no solution attached.
478
+ 3. Each article must be self-contained and usable by a support agent in future.
479
+ 4. Translate all Shona/Ndebele problem descriptions to English in the article content.
480
+ 5. If a screenshot appears to show an Android error or GPS issue, reason through the likely cause and
481
+ include that reasoning in the solution content.
482
 
483
+ OUTPUT FORMAT: Return ONLY a valid JSON array. No preamble, no explanation, no markdown fences.
484
+ Every string value MUST be properly JSON-escaped. Do not use unescaped newlines, tabs, or quotes inside strings.
485
+ Use \\n for line breaks within content strings.
 
 
 
 
486
 
487
+ Schema per item:
488
+ {"title": "string (max 80 chars)", "content": "string (escaped, solution steps)", "category": "one of: Account|Technical|Location|Attendance|Device|Other", "tags": ["array", "of", "strings"]}
489
 
490
+ If no valid problem→solution pairs exist in this segment, return an empty array: []
491
+
492
+ Chat segment:
493
  """
494
 
495
+
496
def _sniff_image_mime(img_bytes: bytes) -> Optional[str]:
    """Return the MIME type for JPEG/PNG/WebP magic bytes, or None if unrecognised."""
    if img_bytes[:3] == b"\xff\xd8\xff":
        return "image/jpeg"
    if img_bytes[:8] == b"\x89PNG\r\n\x1a\n":
        return "image/png"
    # RIFF is a generic container; only the WEBP form type is an image.
    if img_bytes[:4] == b"RIFF" and img_bytes[8:12] == b"WEBP":
        return "image/webp"
    return None


def _process_chunk_with_gemini(chunk: Dict) -> List[Dict]:
    """
    Send a single chunk (text + optional images) to Gemini.

    Images whose magic bytes are not JPEG, PNG or WebP are skipped rather
    than mislabelled. If the multimodal call yields nothing, the chunk is
    retried as text only so its content is not lost.

    Args:
        chunk: Dict with "text" (str) and optionally "images" (list of bytes).

    Returns:
        Validated list of article dicts; [] when Gemini returns nothing.
    """
    text_part = WHATSAPP_EXTRACTION_PROMPT + chunk["text"]

    image_parts = []
    if _gemini_client:
        for img_bytes in chunk.get("images", []):
            mime = _sniff_image_mime(img_bytes)
            if mime is None:
                logger.warning("Skipping image with unrecognised format (%d bytes)", len(img_bytes))
                continue
            image_parts.append(
                genai_types.Part.from_bytes(data=img_bytes, mime_type=mime)
            )

    raw = ""
    if image_parts:
        raw = _gemini_multimodal([text_part] + image_parts, json_mode=True)
        if not raw:
            logger.warning("Multimodal call returned nothing; retrying chunk as text only")
    if not raw:
        raw = _gemini_text(text_part, json_mode=True)

    if not raw:
        logger.warning("Empty Gemini response for chunk")
        return []

    parsed = _safe_json(raw, [])
    return _validate_articles(parsed)
527
+
528
+
529
+ # ══════════════════════════════════════════════════════════════════════════════
530
+ # FEATURE 1 — WhatsApp Export → Knowledge Base (v1.1: ZIP multimodal + chunked)
531
+ # ══════════════════════════════════════════════════════════════════════════════
532
+
533
@app.post("/api/kb/whatsapp-import")
def whatsapp_import():
    """
    Accepts EITHER:
      (a) multipart file upload with field "file" containing a .zip WhatsApp export, OR
      (b) JSON body { "chat_text": "..." } for plain text (legacy support)

    Processes in sliding-window chunks, sends images to Gemini multimodally.
    Saves new articles only (additive, dedup by fingerprint).

    Responds 400 for unusable input and 500 when Gemini or Firebase is not
    available, so an unconfigured server is not reported as "no articles found".
    """
    if not _gemini_client:
        return jsonify({"ok": False, "error": "Gemini not configured on server"}), 500
    if not db:
        return jsonify({"ok": False, "error": "Firebase unavailable"}), 500

    # ── Branch A: ZIP upload ──────────────────────────────────────────────────
    if "file" in request.files:
        f = request.files["file"]
        filename = f.filename or ""
        if not filename.lower().endswith(".zip"):
            return jsonify({"ok": False, "error": "Expected a .zip WhatsApp export file"}), 400

        zip_bytes = f.read()
        logger.info("WhatsApp ZIP upload: %d bytes, filename=%s", len(zip_bytes), filename)

        processor = WhatsAppZipProcessor(zip_bytes)
        if not processor.extract():
            return jsonify({"ok": False, "error": "Could not extract chat from ZIP. Ensure it is a valid WhatsApp export."}), 400

        if len(processor.chat_text) < 100:
            return jsonify({"ok": False, "error": "Extracted chat text too short to process"}), 400

        source_label = f"whatsapp_zip:{filename}"

    # ── Branch B: Legacy plain text JSON body ─────────────────────────────────
    else:
        body = request.get_json(silent=True)
        raw_chat = body.get("chat_text") if isinstance(body, dict) else None
        # A non-string chat_text is treated the same as a missing one.
        raw_chat = raw_chat.strip() if isinstance(raw_chat, str) else ""
        if not raw_chat:
            return jsonify({"ok": False, "error": "Provide a .zip file upload or chat_text in JSON body"}), 400
        if len(raw_chat) < 100:
            return jsonify({"ok": False, "error": "Chat text too short to process"}), 400

        logger.info("WhatsApp plain text import: %d chars", len(raw_chat))

        # Reuse the processor purely for its chunking; there is no media.
        processor = WhatsAppZipProcessor(b"")
        processor.chat_text = raw_chat
        source_label = "whatsapp_export"

    # ── Shared: chunk → Gemini → collect ──────────────────────────────────────
    chunks = processor.build_chunks()
    all_articles: List[Dict] = []
    for idx, chunk in enumerate(chunks, start=1):
        logger.info("Processing chunk %d/%d", idx, len(chunks))
        articles = _process_chunk_with_gemini(chunk)
        all_articles.extend(articles)
        logger.info("Chunk %d yielded %d articles (running total: %d)", idx, len(articles), len(all_articles))

    if not all_articles:
        logger.info("No articles extracted from this export")
        return jsonify({
            "ok": True,
            "articles_found": 0,
            "saved": 0,
            "skipped_dupes": 0,
            "note": "No clear problem→solution pairs found in this chat segment"
        })

    stats = _save_kb_articles(all_articles, source_label=source_label)
    logger.info("WhatsApp import complete: found=%d, %s", len(all_articles), stats)

    return jsonify({
        "ok": True,
        "articles_found": len(all_articles),
        "saved": stats["saved"],
        "skipped_dupes": stats["skipped"],
    })
618
 
619
 
 
622
  # ══════════════════════════════════════════════════════════════════════════════
623
 
624
  def _extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
 
625
  if PYPDF_AVAILABLE:
626
  try:
627
  reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))
 
631
  return text
632
  except Exception as e:
633
  logger.warning("pypdf extraction failed: %s", e)
 
 
634
  if _gemini_client:
635
  try:
636
+ resp = _gemini_client.models.generate_content(
 
637
  model=GEMINI_MODEL,
638
  contents=[
639
  "Extract all text from this PDF document. Return plain text only.",
 
646
  return ""
647
 
648
 
649
+ PDF_KB_PROMPT = """You are a support knowledge base curator.
 
650
  Convert the following document content into structured KB articles.
651
+ Each article covers one distinct topic, issue, or procedure.
652
 
653
+ Return ONLY a valid JSON array no preamble, no markdown fences.
654
+ All string values must be properly JSON-escaped (no raw newlines inside strings, use \\n).
 
 
 
 
 
655
 
656
+ Schema per item:
657
+ {"title": "string", "content": "string", "category": "one of: Account|Billing|Technical|Feature|Other", "tags": ["string"]}
658
 
659
  Document content:
660
  """
661
 
662
  @app.post("/api/kb/bulk-upload")
663
  def bulk_upload():
 
 
 
 
 
664
  if "file" not in request.files:
665
  return jsonify({"ok": False, "error": "No file uploaded"}), 400
 
666
  f = request.files["file"]
667
  filename = f.filename or ""
668
  ext = filename.rsplit(".", 1)[-1].lower()
669
  file_data = f.read()
670
+ articles = []
 
671
 
672
  if ext in ("csv", "xlsx", "xls"):
673
  if not PANDAS_AVAILABLE:
674
  return jsonify({"ok": False, "error": "pandas not installed on server"}), 500
675
  try:
676
+ df = pd.read_csv(io.BytesIO(file_data)) if ext == "csv" else pd.read_excel(io.BytesIO(file_data))
 
 
 
 
677
  df.columns = [c.strip().lower() for c in df.columns]
 
678
  if "title" not in df.columns or "content" not in df.columns:
679
  return jsonify({"ok": False, "error": "CSV/Excel must have 'title' and 'content' columns"}), 400
 
680
  for _, row in df.iterrows():
681
  tags = []
682
  if "tags" in df.columns and pd.notna(row.get("tags")):
683
+ tags = [t.strip() for t in re.split(r"[,;|]", str(row["tags"])) if t.strip()]
 
 
684
  articles.append({
685
  "title": str(row["title"]).strip(),
686
  "content": str(row["content"]).strip(),
 
688
  "tags": tags,
689
  })
690
  except Exception as e:
 
691
  return jsonify({"ok": False, "error": f"Could not parse file: {e}"}), 400
692
 
693
  elif ext == "pdf":
694
  text = _extract_text_from_pdf_bytes(file_data)
695
  if not text:
696
  return jsonify({"ok": False, "error": "Could not extract text from PDF"}), 400
697
+ raw = _gemini_text(PDF_KB_PROMPT + text[:50000], json_mode=True)
698
+ parsed = _safe_json(raw, [])
699
+ articles = _validate_articles(parsed)
700
+ if not articles:
701
+ return jsonify({"ok": False, "error": "Gemini PDF structuring returned no valid articles"}), 500
 
702
  else:
703
+ return jsonify({"ok": False, "error": f"Unsupported file type .{ext}. Use csv, xlsx, or pdf"}), 400
704
 
705
  if not articles:
706
  return jsonify({"ok": False, "error": "No articles extracted from file"}), 400
707
 
708
  stats = _save_kb_articles(articles, source_label=f"bulk_upload:{filename}")
709
+ return jsonify({"ok": True, "articles_found": len(articles), "saved": stats["saved"], "skipped_dupes": stats["skipped"]})
 
 
 
 
 
710
 
711
 
712
  # ══════════════════════════════════════════════════════════════════════════════
713
  # FEATURE 3 — Ticket Submission via NL Text or Voice
714
  # ══════════════════════════════════════════════════════════════════════════════
715
 
716
+ TICKET_EXTRACTION_PROMPT = """You are a support ticket intake system for a software support portal.
 
717
 
718
  A user has described their issue in natural language. Extract structured ticket fields.
719
 
720
+ Return ONLY a valid JSON object — no preamble, no markdown fences.
721
+ All string values must be properly JSON-escaped.
 
 
 
 
 
 
722
 
723
+ Schema:
724
+ {"title": "string (max 80 chars)", "description": "string (full clear description)", "category_hint": "one of: Account|Billing|Technical|Feature|Other", "priority_hint": "one of: low|medium|high|critical", "keywords": ["string"]}
725
+
726
+ User message:
727
  """
728
 
729
  def _transcribe_audio_assemblyai(audio_b64: str, audio_format: str = "wav") -> str:
 
730
  if not ASSEMBLYAI_API_KEY:
731
  return ""
 
732
  audio_bytes = base64.b64decode(audio_b64)
733
  headers = {"authorization": ASSEMBLYAI_API_KEY}
 
 
734
  try:
735
  upload_resp = requests.post(
736
  f"{ASSEMBLYAI_BASE}/upload",
737
  headers={**headers, "Content-Type": "application/octet-stream"},
738
+ data=audio_bytes, timeout=30
 
739
  )
740
  upload_resp.raise_for_status()
741
  upload_url = upload_resp.json().get("upload_url")
742
  except Exception as e:
743
  logger.error("AssemblyAI upload error: %s", e)
744
  return ""
 
 
745
  try:
746
  tx_resp = requests.post(
747
  f"{ASSEMBLYAI_BASE}/transcript",
748
  headers={**headers, "Content-Type": "application/json"},
749
+ json={"audio_url": upload_url, "language_detection": True}, timeout=15
 
750
  )
751
  tx_resp.raise_for_status()
752
  tx_id = tx_resp.json().get("id")
753
  except Exception as e:
754
  logger.error("AssemblyAI transcript request error: %s", e)
755
  return ""
 
 
756
  for _ in range(30):
757
  time.sleep(3)
758
  try:
759
+ poll = requests.get(f"{ASSEMBLYAI_BASE}/transcript/{tx_id}", headers=headers, timeout=15)
 
 
 
 
760
  poll.raise_for_status()
761
  result = poll.json()
762
  status = result.get("status")
 
772
 
773
@app.post("/api/tickets/submit-nl")
def submit_ticket_nl():
    """
    POST /api/tickets/submit-nl
    Body: { "message": "<free-text issue description>", "user_id": "<optional>" }

    Asks Gemini to extract structured ticket fields from the message,
    records the draft in ``iris_ai_ticket_drafts`` when Firestore is
    available, and returns the extracted ticket.

    Responds 400 when ``message`` is missing or not a string, 500 when no
    ticket with a title could be extracted.
    """
    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        # A JSON array/scalar body has no fields to read.
        body = {}
    message = body.get("message")
    message = message.strip() if isinstance(message, str) else ""
    user_id = body.get("user_id", "anonymous")

    if not message:
        return jsonify({"ok": False, "error": "message is required"}), 400

    raw = _gemini_text(TICKET_EXTRACTION_PROMPT + message, json_mode=True)
    ticket = _safe_json(raw, {})
    if not isinstance(ticket, dict) or not ticket.get("title"):
        return jsonify({"ok": False, "error": "Could not extract ticket info from message"}), 500

    if db:
        db.collection("iris_ai_ticket_drafts").add({
            "user_id": user_id, "raw_input": message,
            "extracted": ticket, "channel": "nl_text",
            "created_at": datetime.now(timezone.utc).isoformat(),
        })

    return jsonify({"ok": True, "ticket": ticket})
791
 
792
 
793
@app.post("/api/tickets/submit-voice")
def submit_ticket_voice():
    """
    POST /api/tickets/submit-voice
    Body: { "audio_b64": "<base64 audio>", "audio_format": "wav", "user_id": "<optional>" }

    Transcribes the audio via AssemblyAI, asks Gemini to extract ticket
    fields from the transcript, records the draft in
    ``iris_ai_ticket_drafts`` when Firestore is available, and returns
    both transcript and ticket.

    Responds 400 for missing or undecodable audio, 500 when AssemblyAI is
    not configured, transcription fails, or no ticket could be extracted.
    """
    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        body = {}
    audio_b64 = body.get("audio_b64", "")
    audio_format = body.get("audio_format", "wav")
    user_id = body.get("user_id", "anonymous")

    if not audio_b64:
        return jsonify({"ok": False, "error": "audio_b64 is required"}), 400

    if not ASSEMBLYAI_API_KEY:
        return jsonify({"ok": False, "error": "AssemblyAI not configured on server"}), 500

    # The transcription helper decodes the payload before any error
    # handling, so reject undecodable input here with a clean 400.
    try:
        base64.b64decode(audio_b64)
    except (ValueError, TypeError):
        return jsonify({"ok": False, "error": "audio_b64 is not valid base64"}), 400

    transcript = _transcribe_audio_assemblyai(audio_b64, audio_format)
    if not transcript:
        return jsonify({"ok": False, "error": "Transcription failed or returned empty result"}), 500

    raw = _gemini_text(TICKET_EXTRACTION_PROMPT + transcript, json_mode=True)
    ticket = _safe_json(raw, {})
    if not isinstance(ticket, dict) or not ticket.get("title"):
        return jsonify({"ok": False, "error": "Could not extract ticket info from transcript"}), 500

    if db:
        db.collection("iris_ai_ticket_drafts").add({
            "user_id": user_id, "raw_input": transcript,
            "extracted": ticket, "channel": "voice",
            "created_at": datetime.now(timezone.utc).isoformat(),
        })

    return jsonify({"ok": True, "transcript": transcript, "ticket": ticket})
817
 
818
 
 
820
  # FEATURE 4 — System Tutorial Ingestion
821
  # ══════════════════════════════════════════════════════════════════════════════
822
 
823
+ TUTORIAL_EXTRACTION_PROMPT = """You are a knowledge base curator for a software support system.
 
824
 
825
+ You have a timestamped transcript from a tutorial video about the Iris Support Portal.
826
+ Extract one KB article per distinct feature or task demonstrated.
827
 
828
+ Return ONLY a valid JSON array no preamble, no markdown fences.
829
+ All strings must be properly JSON-escaped.
830
 
831
+ Schema per item:
832
+ {"title": "string", "content": "string (step-by-step instructions)", "category": "one of: Account|Tickets|Agents|Reports|Admin|Other", "tags": ["string"], "timestamp_start": <integer seconds>, "timestamp_end": <integer seconds>}
 
 
 
 
 
 
 
833
 
834
+ Transcript:
 
 
835
  """
836
 
837
  def _parse_timestamp_to_seconds(ts: str) -> int:
838
+ parts = str(ts).strip("[]").split(":")
 
839
  try:
840
  if len(parts) == 2:
841
  return int(parts[0]) * 60 + int(parts[1])
 
845
  pass
846
  return 0
847
 
 
848
@app.post("/api/kb/tutorial-ingest")
def tutorial_ingest():
    """
    POST /api/kb/tutorial-ingest
    Body: { "transcript": "<timestamped transcript>", "video_url": "...", "video_title": "..." }

    Asks Gemini to split the transcript (first 50,000 characters) into KB
    articles, attaches the video URL/title and integer-second timestamps
    to each, and saves them via ``_save_kb_articles``.

    Responds 400 when ``transcript`` is missing or not a string, 500 when
    no valid article was produced.
    """
    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        body = {}
    transcript = body.get("transcript")
    transcript = transcript.strip() if isinstance(transcript, str) else ""
    video_url = body.get("video_url", "")
    video_title = body.get("video_title", "Tutorial")

    if not transcript:
        return jsonify({"ok": False, "error": "transcript is required"}), 400

    raw = _gemini_text(TUTORIAL_EXTRACTION_PROMPT + transcript[:50000], json_mode=True)
    parsed = _safe_json(raw, [])

    articles = []
    for item in parsed if isinstance(parsed, list) else []:
        # Validate one item at a time so each surviving article can be
        # paired with its raw item: the timestamps are read from the raw
        # model output, not from the validated copy.
        validated = _validate_articles([item])
        if not validated:
            continue
        article = validated[0]
        article["video_url"] = video_url
        article["video_title"] = video_title
        for ts_key in ("timestamp_start", "timestamp_end"):
            val = item.get(ts_key)
            if isinstance(val, bool):
                # bool is an int subclass but is not a meaningful offset.
                article[ts_key] = 0
            elif isinstance(val, int):
                article[ts_key] = val
            elif isinstance(val, float) and val.is_integer():
                article[ts_key] = int(val)
            elif isinstance(val, str):
                article[ts_key] = _parse_timestamp_to_seconds(val)
            else:
                article[ts_key] = 0
        articles.append(article)

    if not articles:
        return jsonify({"ok": False, "error": "Gemini returned no valid articles from transcript"}), 500

    stats = _save_kb_articles(articles, source_label=f"tutorial:{video_title}")
    return jsonify({"ok": True, "video_title": video_title, "articles_found": len(articles), "saved": stats["saved"], "skipped_dupes": stats["skipped"]})
 
 
 
 
 
 
 
872
 
873
 
874
  # ══════════════════════════════════════════════════════════════════════════════
875
  # FEATURE 5 — Agent Solution Writing (NL Text + Voice)
876
  # ══════════════════════════════════════════════════════════════════════════════
877
 
878
+ SOLUTION_EXTRACTION_PROMPT = """You are a support knowledge base curator.
879
+ An agent has described a solution they used to resolve a ticket.
 
 
880
  Structure this into a reusable KB article.
881
 
882
+ Return ONLY a valid JSON object — no preamble, no markdown fences.
883
+ All strings must be properly JSON-escaped.
884
+
885
+ Schema:
886
+ {"title": "string", "content": "string (clear step-by-step solution)", "category": "one of: Account|Billing|Technical|Feature|Other", "tags": ["string"]}
 
 
887
 
888
+ Agent description:
889
  """
890
 
891
@app.post("/api/kb/agent-solution-nl")
def agent_solution_nl():
    """
    POST /api/kb/agent-solution-nl
    Body: { "message": "<solution description>", "agent_id": "...", "ticket_id": "<optional>" }

    Asks Gemini to structure the agent's description into a KB article,
    tags it with ``ticket:<ticket_id>`` when a ticket id is given, and
    saves it via ``_save_kb_articles``.

    Responds 400 when ``message`` is missing or not a string, 500 when no
    article with a title could be produced.
    """
    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        body = {}
    message = body.get("message")
    message = message.strip() if isinstance(message, str) else ""
    agent_id = body.get("agent_id", "unknown")
    ticket_id = body.get("ticket_id", "")

    if not message:
        return jsonify({"ok": False, "error": "message is required"}), 400

    raw = _gemini_text(SOLUTION_EXTRACTION_PROMPT + message, json_mode=True)
    article = _safe_json(raw, {})
    if not isinstance(article, dict) or not article.get("title"):
        return jsonify({"ok": False, "error": "Could not structure solution"}), 500

    # The model may return tags as null or a bare string; appending to
    # either would raise, so normalise to a list first.
    tags = article.get("tags")
    article["tags"] = list(tags) if isinstance(tags, list) else []
    if ticket_id:
        article["tags"].append(f"ticket:{ticket_id}")

    stats = _save_kb_articles([article], source_label=f"agent:{agent_id}")
    return jsonify({"ok": True, "saved": stats["saved"], "article": article})
 
 
 
 
 
907
 
908
 
@app.post("/api/kb/agent-solution-voice")
def agent_solution_voice():
    """Turn an agent's spoken solution description into a KB article.

    Expects a JSON object body with ``audio_b64`` (required) and optional
    ``audio_format`` (default ``"wav"``), ``agent_id`` and ``ticket_id``.
    The audio is passed to ``_transcribe_audio_assemblyai``; the transcript
    is appended to SOLUTION_EXTRACTION_PROMPT, sent through ``_gemini_text``
    in JSON mode, and the parsed article is handed to ``_save_kb_articles``
    with the source label ``agent:<agent_id>``. When ``ticket_id`` is given,
    a ``ticket:<ticket_id>`` tag is added to the article first.

    Returns:
        400 if ``audio_b64`` is missing, empty, or not a string; 500 if the
        transcriber returns a falsy result or the model output does not parse
        to a dict with a truthy ``title``; otherwise a JSON payload with the
        ``transcript``, ``saved`` and the structured ``article``.
    """
    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        # A JSON array or scalar body has no .get(); treat it as empty.
        body = {}

    audio_b64 = body.get("audio_b64", "")
    audio_format = body.get("audio_format", "wav")
    agent_id = body.get("agent_id", "unknown")
    ticket_id = body.get("ticket_id", "")

    # Reject non-string payloads here instead of handing them to the transcriber.
    if not audio_b64 or not isinstance(audio_b64, str):
        return jsonify({"ok": False, "error": "audio_b64 is required"}), 400

    transcript = _transcribe_audio_assemblyai(audio_b64, audio_format)
    if not transcript:
        return jsonify({"ok": False, "error": "Transcription failed"}), 500

    raw = _gemini_text(SOLUTION_EXTRACTION_PROMPT + transcript, json_mode=True)
    article = _safe_json(raw, {})
    if not isinstance(article, dict) or not article.get("title"):
        return jsonify({"ok": False, "error": "Could not structure solution from transcript"}), 500

    if ticket_id:
        # The model may emit "tags" as null or a bare string; setdefault()
        # would return that value unchanged and .append() would then raise.
        tags = article.get("tags")
        if isinstance(tags, str):
            tags = [tags]
        elif not isinstance(tags, list):
            tags = []
        tags.append(f"ticket:{ticket_id}")
        article["tags"] = tags

    stats = _save_kb_articles([article], source_label=f"agent:{agent_id}")
    return jsonify({"ok": True, "transcript": transcript, "saved": stats["saved"], "article": article})
 
 
 
 
 
 
929
 
930
 
931
  # ══════════════════════════════════════════════════════════════════════════════
932
+ # FEATURE 6 — Iris Chatbot (RAG over KB + Tutorials)
933
  # ══════════════════════════════════════════════════════════════════════════════
934
 
935
  def _search_kb(query: str, limit: int = 5) -> List[Dict]:
 
 
 
 
936
  if not db:
937
  return []
 
938
  query_terms = [t.lower() for t in query.split() if len(t) > 2]
 
939
  try:
 
940
  docs = db.collection("iris_kb_articles").order_by(
941
  "created_at", direction=firestore.Query.DESCENDING
942
  ).limit(200).stream()
 
943
  results = []
944
  for doc in docs:
945
  d = doc.to_dict()
 
947
  score = sum(1 for term in query_terms if term in text)
948
  if score > 0:
949
  results.append({"score": score, **d})
 
950
  results.sort(key=lambda x: x["score"], reverse=True)
951
  return results[:limit]
 
952
  except Exception as e:
953
  logger.error("KB search error: %s", e)
954
  return []
955
 
956
 
# System instructions placed at the top of every chatbot prompt; the chatbot
# endpoint follows them with the retrieved KB context and the user's question.
CHATBOT_SYSTEM_PROMPT = "\n".join((
    "You are Iris, an intelligent support assistant for the Iris Support Portal.",
    "",
    "Answer ONLY from the provided knowledge base context.",
    "If the answer is in a tutorial with a timestamp, mention the video and timestamp.",
    "Be concise, clear, and friendly. Format step-by-step answers as numbered lists.",
    "If you cannot find the answer, say so honestly and suggest submitting a ticket.",
    "",  # keeps the final newline of the original literal
))
964
 
965
  @app.post("/api/chatbot/query")
966
  def chatbot_query():
 
 
 
 
 
 
 
 
967
  body = request.get_json(silent=True) or {}
968
  message = body.get("message", "").strip()
969
  session_id = body.get("session_id", "default")
970
  user_id = body.get("user_id", "anonymous")
 
971
  if not message:
972
  return jsonify({"ok": False, "error": "message is required"}), 400
 
 
973
  kb_results = _search_kb(message, limit=5)
 
974
  context_blocks = []
975
  sources = []
976
  for r in kb_results:
977
  block = f"[Article: {r.get('title')}]\n{r.get('content', '')}"
978
  if r.get("timestamp_start") is not None:
979
+ ts = r["timestamp_start"]
980
+ block += f"\n(Tutorial: {r.get('video_title','Video')} at {ts//60:02d}:{ts%60:02d}"
981
+ if r.get("video_url"):
982
+ block += f" — {r['video_url']}"
983
+ block += ")"
 
984
  context_blocks.append(block)
985
  sources.append({
986
  "title": r.get("title"),
 
989
  "ts_start": r.get("timestamp_start"),
990
  "video_url": r.get("video_url"),
991
  })
992
+ context_str = "\n\n---\n\n".join(context_blocks) if context_blocks else "No relevant articles found."
993
+ full_prompt = f"{CHATBOT_SYSTEM_PROMPT}\n\nKNOWLEDGE BASE CONTEXT:\n{context_str}\n\nUSER QUESTION: {message}\n\nAnswer:"
994
+ answer = _gemini_text(full_prompt)
 
 
 
 
 
 
 
 
 
 
 
995
  if not answer:
996
+ answer = "Sorry, I could not process your question right now. Please try again or submit a support ticket."
 
 
997
  if db:
998
  db.collection("iris_chatbot_logs").add({
999
+ "user_id": user_id, "session_id": session_id,
1000
+ "message": message, "answer": answer, "sources": sources,
 
 
 
1001
  "created_at": datetime.now(timezone.utc).isoformat(),
1002
  })
1003
+ return jsonify({"ok": True, "answer": answer, "sources": sources})
 
 
 
 
 
1004
 
1005
 
1006
  # ══════════════════════════════════════════════════════════════════════════════
1007
+ # KB READ / DELETE ENDPOINTS
1008
  # ══════════════════════════════════════════════════════════════════════════════
1009
 
@app.get("/api/kb/articles")
def list_kb_articles():
    """List KB articles ordered by ``created_at`` descending.

    Query parameters:
        category: when non-empty, only articles whose ``category`` field
            equals this value are returned.
        limit: maximum number of articles to return; defaults to 50 and must
            be a positive integer.

    Returns:
        400 if ``limit`` is not a positive integer; 500 if Firestore is
        unavailable or the query raises; otherwise a JSON payload with
        ``articles`` (each document's fields plus its ``id``) and ``count``.
    """
    category = request.args.get("category", "")

    # int() used to run outside any try block, so "?limit=abc" produced an
    # unhandled ValueError instead of a client error.
    try:
        limit = int(request.args.get("limit", 50))
    except (TypeError, ValueError):
        limit = 0
    if limit < 1:
        return jsonify({"ok": False, "error": "limit must be a positive integer"}), 400

    if not db:
        return jsonify({"ok": False, "error": "Firebase unavailable"}), 500

    try:
        query = db.collection("iris_kb_articles").order_by(
            "created_at", direction=firestore.Query.DESCENDING
        )
        if category:
            query = query.where("category", "==", category)
        docs = query.limit(limit).stream()
        articles = [{"id": d.id, **d.to_dict()} for d in docs]
        return jsonify({"ok": True, "articles": articles, "count": len(articles)})
    except Exception as e:
        # Log the failure server-side; the response shape is unchanged.
        logger.error("KB list error: %s", e)
        return jsonify({"ok": False, "error": str(e)}), 500
1025
 
1026
 
1027
  @app.delete("/api/kb/articles/<article_id>")
1028
  def delete_kb_article(article_id: str):
 
1029
  if not db:
1030
  return jsonify({"ok": False, "error": "Firebase unavailable"}), 500
1031
  try:
 
1048
  article_count = docs[0][0].value
1049
  except Exception:
1050
  pass
 
1051
  return jsonify({
1052
+ "ok": True,
1053
+ "service": "Iris AI Service v1.1",
1054
+ "model": GEMINI_MODEL,
1055
+ "gemini": bool(_gemini_client),
1056
+ "assemblyai": bool(ASSEMBLYAI_API_KEY),
1057
+ "firebase": bool(db),
1058
+ "kb_articles": article_count,
1059
  })
1060
 
1061
 
 
1065
 
if __name__ == "__main__":
    # Script entry point: PORT overrides the default listen port of 7860.
    listen_port = int(os.environ.get("PORT", 7860))
    logger.info(
        "Iris AI Service v1.1 starting on port %d (model=%s)",
        listen_port,
        GEMINI_MODEL,
    )
    # Bind on all interfaces.
    app.run(host="0.0.0.0", port=listen_port)