""" Artifact generation: report, quiz, podcast (transcript + mp3). Uses Gemini API only for text; grounded in retrieved chunks with [1], [2] citations. Citations block is built from chunk metadata (never trust model-made bibliographies). """ import time from typing import Any, Dict, List, Optional, Set from backend.config import ( ARTIFACT_PODCAST, ARTIFACT_QUIZ, ARTIFACT_REPORT, TOP_K, ) from backend.gemini_client import ( build_citations_block, generate_with_gemini, is_gemini_error_response, parse_citation_numbers, ) from backend.retriever import retrieve from backend.storage import ( artifacts_index_path, podcasts_dir, quizzes_dir, reports_dir, ) from backend.utils import new_uuid, read_json, write_json, logger from backend import tts as tts_module # Artifact-specific retrieval queries (used to fetch context for Gemini) REPORT_QUERY = "Summarize and explain the most important concepts" QUIZ_QUERY = "Create assessment questions covering the key concepts" PODCAST_QUERY = "Create an engaging conversational explanation of key concepts" # Max tokens per artifact type (Gemini) MAX_TOKENS_REPORT = 2000 MAX_TOKENS_QUIZ = 2500 MAX_TOKENS_PODCAST = 3000 def _now_iso() -> str: from datetime import datetime return datetime.utcnow().isoformat() + "Z" def _retrieve_chunks( username: str, notebook_id: str, query: str, extra_instruction: str, strategy: str = "similarity", ) -> List[dict]: """Retrieve chunks (only from enabled sources). Query + extra_instruction used for retrieval.""" combined = f"{query}. {extra_instruction}".strip() if extra_instruction else query chunks, _ = retrieve(username, notebook_id, combined, top_k=TOP_K * 2, strategy=strategy) return chunks def _build_numbered_context(chunks: List[dict]) -> str: """Build context as [1] (source_name, page/slide/url), [2] ..., for Gemini.""" parts = [] for i, c in enumerate(chunks, start=1): meta = c.get("metadata", {}) or {} name = meta.get("source_name", "Source") loc = meta.get("page_or_slide", "") if loc == "web": loc_label = "url" elif loc: loc_label = f"page {loc}" else: loc_label = "—" doc = (c.get("document") or "").strip() parts.append(f"[{i}] {doc}\n(Source: {name}, {loc_label})") return "\n\n---\n\n".join(parts) def _append_citations_to_markdown(body: str, used_numbers: Set[int], chunks: List[dict]) -> str: """Append Citations section to markdown; we map [n] from chunk metadata (never trust model bib).""" block = build_citations_block(used_numbers, chunks) if not block: return body.rstrip() return body.rstrip() + "\n\n" + block def generate_report( username: str, notebook_id: str, extra_instruction: str = "", strategy: str = "similarity", ) -> Dict[str, Any]: """Generate report via Gemini. Context-only; citations [1], [2] from retrieved chunks.""" chunks = _retrieve_chunks(username, notebook_id, REPORT_QUERY, extra_instruction, strategy) if not chunks: return {"error": "No sources to generate report from. Add and enable sources in this notebook."} numbered_ctx = _build_numbered_context(chunks) system = ( "You write clear, structured reports in Markdown. You may ONLY use the provided numbered context. " "Do not invent facts. If the context is insufficient to answer a point, say so. " "Use citations by referencing the numbered chunks: [1], [2], etc. at the end of the sentence or paragraph. " "Output must include: a short executive summary, main sections with headings and bullet points, and key takeaways. " "Use only the given numbers that correspond to the context chunks." ) user = f"Context (cite using [1], [2], ...):\n\n{numbered_ctx}\n\nWrite a structured report in Markdown with executive summary, headings, bullet points, and key takeaways. Cite sources with [n]." if extra_instruction: user += f"\n\nAdditional instruction: {extra_instruction}" t0 = time.perf_counter() raw = generate_with_gemini(system, user, max_output_tokens=MAX_TOKENS_REPORT) generation_time = time.perf_counter() - t0 if is_gemini_error_response(raw): return {"error": raw} used = parse_citation_numbers(raw) report_md = _append_citations_to_markdown(raw, used, chunks) artifact_id = new_uuid() reports_dir_path = reports_dir(username, notebook_id) filename = f"report_{artifact_id[:8]}.md" path = reports_dir_path / filename path.write_text(report_md, encoding="utf-8") entry = { "id": artifact_id, "type": ARTIFACT_REPORT, "filename": filename, "created_at": _now_iso(), "prompt": extra_instruction, "retrieval_strategy": strategy, } _append_artifact_index(username, notebook_id, entry) return {"path": str(path), "filename": filename, "content": report_md, "entry": entry, "generation_time": generation_time} def generate_quiz( username: str, notebook_id: str, extra_instruction: str = "", strategy: str = "similarity", ) -> Dict[str, Any]: """Generate quiz via Gemini. 10–15 questions mixed MCQ + short answer; ANSWER KEY section.""" chunks = _retrieve_chunks(username, notebook_id, QUIZ_QUERY, extra_instruction, strategy) if not chunks: return {"error": "No sources to generate quiz from. Add and enable sources in this notebook."} numbered_ctx = _build_numbered_context(chunks) system = ( "You create quizzes in Markdown. You may ONLY use the provided numbered context. " "Do not invent facts. If context is insufficient, say so. " "Cite sources with [1], [2], etc. when a question or answer comes from a chunk. " "Include 10–15 questions: a mix of multiple choice (MCQ) and short answer. " "At the end, include a section: ## Answer Key with answers for all questions. " "Use only the given numbers that correspond to the context chunks." ) user = f"Context (cite using [1], [2], ...):\n\n{numbered_ctx}\n\nCreate a quiz in Markdown: 10–15 questions (MCQ + short answer), then ## Answer Key." if extra_instruction: user += f"\n\nAdditional instruction: {extra_instruction}" t0 = time.perf_counter() raw = generate_with_gemini(system, user, max_output_tokens=MAX_TOKENS_QUIZ) generation_time = time.perf_counter() - t0 if is_gemini_error_response(raw): return {"error": raw} used = parse_citation_numbers(raw) quiz_md = _append_citations_to_markdown(raw, used, chunks) artifact_id = new_uuid() quizzes_dir_path = quizzes_dir(username, notebook_id) filename = f"quiz_{artifact_id[:8]}.md" path = quizzes_dir_path / filename path.write_text(quiz_md, encoding="utf-8") entry = { "id": artifact_id, "type": ARTIFACT_QUIZ, "filename": filename, "created_at": _now_iso(), "prompt": extra_instruction, "retrieval_strategy": strategy, } _append_artifact_index(username, notebook_id, entry) return {"path": str(path), "filename": filename, "content": quiz_md, "entry": entry, "generation_time": generation_time} def generate_podcast( username: str, notebook_id: str, extra_instruction: str = "", strategy: str = "similarity", ) -> Dict[str, Any]: """Generate podcast transcript via Gemini (2 speakers, 4–8 min of text); then TTS for .mp3.""" chunks = _retrieve_chunks(username, notebook_id, PODCAST_QUERY, extra_instruction, strategy) if not chunks: return {"error": "No sources to generate podcast from. Add and enable sources in this notebook."} numbered_ctx = _build_numbered_context(chunks) system = ( "You write a 2-speaker podcast transcript. You may ONLY use the provided numbered context. " "Do not invent facts. Use 'Speaker A:' and 'Speaker B:' before each line. " "Natural dialogue; 4–8 minutes worth of text when read aloud (roughly 600–1200 words). " "Include occasional citations like [3] when referring to a source. " "Use only the given numbers that correspond to the context chunks." ) user = f"Context (cite using [1], [2], ...):\n\n{numbered_ctx}\n\nWrite a 2-speaker podcast transcript (Speaker A:, Speaker B:). 4–8 minutes of dialogue, grounded in the context, with occasional [n] citations." if extra_instruction: user += f"\n\nAdditional instruction: {extra_instruction}" t0 = time.perf_counter() raw = generate_with_gemini(system, user, max_output_tokens=MAX_TOKENS_PODCAST) generation_time = time.perf_counter() - t0 if is_gemini_error_response(raw): return {"error": raw} used = parse_citation_numbers(raw) transcript_md = _append_citations_to_markdown(raw, used, chunks) artifact_id = new_uuid() podcasts_dir_path = podcasts_dir(username, notebook_id) transcript_filename = f"transcript_{artifact_id[:8]}.md" transcript_path = podcasts_dir_path / transcript_filename transcript_path.write_text(transcript_md, encoding="utf-8") mp3_filename = f"podcast_{artifact_id[:8]}.mp3" mp3_path = podcasts_dir_path / mp3_filename success = tts_module.text_to_speech(transcript_md[:5000], mp3_path, lang="en") if not success: logger.warning("TTS failed for podcast %s", artifact_id) entry = { "id": artifact_id, "type": ARTIFACT_PODCAST, "filename": mp3_filename, "transcript_filename": transcript_filename, "created_at": _now_iso(), "prompt": extra_instruction, "retrieval_strategy": strategy, } _append_artifact_index(username, notebook_id, entry) return { "path": str(mp3_path), "filename": mp3_filename, "transcript_path": str(transcript_path), "transcript_content": transcript_md, "entry": entry, "audio_ok": success, "generation_time": generation_time, } def _append_artifact_index(username: str, notebook_id: str, entry: Dict[str, Any]) -> None: path = artifacts_index_path(username, notebook_id) data = read_json(path, default={"artifacts": []}) data.setdefault("artifacts", []).append(entry) write_json(path, data) def list_artifacts(username: str, notebook_id: str) -> List[Dict[str, Any]]: path = artifacts_index_path(username, notebook_id) data = read_json(path, default={"artifacts": []}) return list(data.get("artifacts", [])[::-1]) def get_report_content(username: str, notebook_id: str, filename: str) -> str: path = reports_dir(username, notebook_id) / filename if not path.exists(): return "" return path.read_text(encoding="utf-8") def get_quiz_content(username: str, notebook_id: str, filename: str) -> str: path = quizzes_dir(username, notebook_id) / filename if not path.exists(): return "" return path.read_text(encoding="utf-8") def get_podcast_transcript(username: str, notebook_id: str, transcript_filename: str) -> str: path = podcasts_dir(username, notebook_id) / transcript_filename if not path.exists(): return "" return path.read_text(encoding="utf-8") def get_podcast_audio_path(username: str, notebook_id: str, mp3_filename: str) -> Optional[str]: path = podcasts_dir(username, notebook_id) / mp3_filename if not path.exists(): return None return str(path)