Hitakshi26 committed on
Commit
b4c7867
·
1 Parent(s): e0ae835

NotebookLM Clone

Browse files
README.md CHANGED
@@ -1,12 +0,0 @@
1
- ---
2
- title: GPP1
3
- emoji: 🏆
4
- colorFrom: blue
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 6.8.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Application entry point: build the Gradio UI and expose it as `demo`.
from src.frontend.ui import build_app

# Built at import time so `demo` exists as a module-level name.
demo = build_app()

# Only start the server when this file is executed directly.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio[oauth]==4.44.1
2
+ chromadb==0.5.5
3
+ sentence-transformers==3.0.1
4
+ pypdf==4.3.1
5
+ python-pptx==1.0.2
6
+ beautifulsoup4==4.12.3
7
+ requests==2.32.3
8
+ gTTS==2.5.3
9
+ huggingface_hub==0.24.6
10
+
11
+ # Hugging Face token: never commit it to this file; set HF_INFERENCE_TOKEN in the Space secrets instead.
src/backend/__pycache__/artifacts.cpython-310.pyc ADDED
Binary file (2.12 kB). View file
 
src/backend/__pycache__/auth.cpython-310.pyc ADDED
Binary file (461 Bytes). View file
 
src/backend/__pycache__/ingest.cpython-310.pyc ADDED
Binary file (4.94 kB). View file
 
src/backend/__pycache__/llm.cpython-310.pyc ADDED
Binary file (841 Bytes). View file
 
src/backend/__pycache__/notebooks.cpython-310.pyc ADDED
Binary file (1.9 kB). View file
 
src/backend/__pycache__/rag.cpython-310.pyc ADDED
Binary file (2.36 kB). View file
 
src/backend/artifacts.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from gtts import gTTS
3
+ from src.backend.llm import llm_generate
4
+ from src.backend.rag import format_sources, context_block
5
+
6
def generate_report(topic: str, hits, extra_prompt: str):
    """Ask the LLM for a markdown study report on *topic* grounded in *hits*.

    Args:
        topic: Subject of the report; interpolated into the prompt.
        hits: Retrieved chunks, passed to ``format_sources`` and
            ``context_block`` to build the sources list and the excerpts.
        extra_prompt: Optional extra instructions; "(none)" is used when falsy.

    Returns:
        The value returned by ``llm_generate`` for the assembled prompt.
    """
    # NOTE(review): the prompt body is kept flush-left exactly as it appears
    # in the reviewed source; confirm against the original file's whitespace.
    prompt = f"""
Write a markdown study report grounded ONLY in the sources.
Every non-trivial claim must include citations like [S1].

Topic: {topic}
Extra instructions: {extra_prompt or "(none)"}

Sources list:
{format_sources(hits)}

Excerpts:
{context_block(hits)}

Output:
# Report
## Key Concepts
## Detailed Notes
## Key Takeaways
"""
    return llm_generate(prompt, max_new_tokens=900, temperature=0.25)
27
+
28
def generate_quiz(topic: str, hits, extra_prompt: str):
    """Ask the LLM for a markdown quiz (with answer key) grounded in *hits*.

    Args:
        topic: Subject of the quiz; interpolated into the prompt.
        hits: Retrieved chunks, passed to ``format_sources`` and
            ``context_block`` to build the sources list and the excerpts.
        extra_prompt: Optional extra instructions; "(none)" is used when falsy.

    Returns:
        The value returned by ``llm_generate`` for the assembled prompt.
    """
    # NOTE(review): the prompt body is kept flush-left exactly as it appears
    # in the reviewed source; confirm against the original file's whitespace.
    prompt = f"""
Write a markdown quiz grounded ONLY in the sources.
Create 8 questions:
- 5 multiple choice
- 3 short answer
Then include an Answer Key with explanations.
Explanations must include citations like [S1].

Topic: {topic}
Extra instructions: {extra_prompt or "(none)"}

Sources list:
{format_sources(hits)}

Excerpts:
{context_block(hits)}

Output:
# Quiz
## Questions
## Answer Key
"""
    return llm_generate(prompt, max_new_tokens=900, temperature=0.25)
52
+
53
def generate_podcast_transcript(topic: str, hits, extra_prompt: str):
    """Ask the LLM for a two-speaker markdown podcast transcript grounded in *hits*.

    Args:
        topic: Subject of the episode; interpolated into the prompt.
        hits: Retrieved chunks, passed to ``format_sources`` and
            ``context_block`` to build the sources list and the excerpts.
        extra_prompt: Optional extra instructions; "(none)" is used when falsy.

    Returns:
        The value returned by ``llm_generate`` for the assembled prompt.
    """
    # NOTE(review): the prompt body is kept flush-left exactly as it appears
    # in the reviewed source; confirm against the original file's whitespace.
    prompt = f"""
Write a markdown podcast transcript grounded ONLY in the sources.
Two speakers: Speaker 1 and Speaker 2.
Every non-trivial claim must include citations like [S1].

Topic: {topic}
Extra instructions: {extra_prompt or "(none)"}

Sources list:
{format_sources(hits)}

Excerpts:
{context_block(hits)}

Output:
# Podcast Transcript
**Speaker 1:** ...
**Speaker 2:** ...
End with Sources section.
"""
    # Slightly higher temperature (0.3) than the report/quiz generators (0.25).
    return llm_generate(prompt, max_new_tokens=900, temperature=0.3)
75
+
76
def transcript_to_mp3(transcript_md: str, out_path: str):
    """Strip markdown/citation markup from a transcript and save it as speech.

    Args:
        transcript_md: Markdown transcript text.
        out_path: Destination path handed to ``gTTS(...).save``.
    """
    spoken = transcript_md
    # Remove, in order: [S<n>] citation tags, runs of '#', and bold markers.
    for pattern in (r"\[(S\d+)\]", r"#+", r"\*\*"):
        spoken = re.sub(pattern, "", spoken)
    # Collapse all whitespace to single spaces, then cap at 4500 characters.
    spoken = re.sub(r"\s+", " ", spoken).strip()[:4500]
    speech = gTTS(text=spoken, lang="en")
    speech.save(out_path)
src/backend/auth.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
def require_login(request: gr.Request) -> str:
    """Return the username attached to *request*, or raise if there is none.

    Raises:
        gr.Error: when ``request`` has no truthy ``username`` attribute.
    """
    user = getattr(request, "username", None)
    if user:
        return user
    raise gr.Error("Please log in using 'Sign in with Hugging Face' to use this app.")
src/backend/ingest.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, pathlib
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from pypdf import PdfReader
5
+ from pptx import Presentation
6
+ from sentence_transformers import SentenceTransformer
7
+
8
+ from src.storage.paths import nb_root, ensure_tree
9
+ from src.storage.chroma_store import get_collection
10
+ from src.utils.text import safe_name
11
+
12
+ EMBED_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
13
+
14
def simple_chunk(text: str, max_chars=2200, overlap=250):
    """Split *text* into overlapping character windows.

    Lines are stripped and blank lines dropped before chunking.

    Args:
        text: Raw text; ``None`` or empty yields an empty list.
        max_chars: Maximum length of each chunk.
        overlap: Number of trailing characters of one chunk that are repeated
            at the start of the next.

    Returns:
        A list of chunk strings (empty when there is no text; a single chunk
        when the cleaned text already fits in ``max_chars``).

    Raises:
        ValueError: if the text has to be split but ``max_chars`` is not
            positive or ``overlap`` is not smaller than ``max_chars``; the
            window would never advance and the loop would not terminate.
    """
    cleaned = "\n".join(
        ln.strip() for ln in (text or "").splitlines() if ln.strip()
    ).strip()
    if not cleaned:
        return []
    if len(cleaned) <= max_chars:
        return [cleaned]
    # Validate only once splitting is actually required, so the two trivial
    # cases above keep working for any arguments.
    if max_chars <= 0 or overlap >= max_chars:
        raise ValueError(
            "max_chars must be positive and overlap must be smaller than max_chars"
        )

    chunks, start, total = [], 0, len(cleaned)
    while start < total:
        end = min(total, start + max_chars)
        chunks.append(cleaned[start:end])
        if end == total:
            break
        # Step back by `overlap` so neighbouring chunks share context.
        start = max(0, end - overlap)
    return chunks
27
+
28
def extract_pdf(path: str):
    """Return one ``{"text", "page"}`` dict per PDF page that yields text.

    Page numbers are 1-based; pages whose extracted text is empty are skipped.
    """
    numbered = (
        (number, (page.extract_text() or "").strip())
        for number, page in enumerate(PdfReader(path).pages, start=1)
    )
    return [{"text": body, "page": number} for number, body in numbered if body]
36
+
37
def extract_pptx(path: str):
    """Return one ``{"text", "slide"}`` dict per slide that contains text.

    Slide numbers are 1-based; text is gathered from every shape exposing a
    non-empty ``text`` attribute and joined with newlines.
    """
    collected = []
    for number, slide in enumerate(Presentation(path).slides, start=1):
        pieces = [
            shape.text.strip()
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text
        ]
        body = "\n".join(piece for piece in pieces if piece).strip()
        if not body:
            continue
        collected.append({"text": body, "slide": number})
    return collected
49
+
50
def extract_txt(path: str):
    """Read a UTF-8 text file and wrap its stripped content as a single item.

    Undecodable bytes are ignored. Returns ``[]`` when the file is empty or
    contains only whitespace.
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        body = handle.read().strip()
    if not body:
        return []
    return [{"text": body, "page": None}]
54
+
55
def extract_url(url: str):
    """Download *url* and return its visible text as a single extracted item.

    ``script``, ``style`` and ``noscript`` elements are removed, blank lines
    dropped, and the text is capped at 200000 characters.

    Raises:
        Whatever ``requests`` raises for connection problems or, via
        ``raise_for_status``, for HTTP error responses.
    """
    response = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    for node in soup(["script","style","noscript"]):
        node.decompose()
    stripped_lines = (ln.strip() for ln in soup.get_text("\n").splitlines())
    body = "\n".join(ln for ln in stripped_lines if ln)
    return [{"text": body[:200000], "page": None}]
64
+
65
def upsert_extracted(username: str, notebook_id: str, source_title: str, source_id: str, extracted_items: list[dict]) -> int:
    """Chunk, embed and upsert extracted items into the notebook's collection.

    Args:
        username: Owner of the notebook.
        notebook_id: Notebook whose collection receives the chunks.
        source_title: Human-readable title stored in each chunk's metadata.
        source_id: Stable identifier of the source; prefix of every chunk id.
        extracted_items: Dicts with a ``"text"`` key and optional ``"page"`` /
            ``"slide"`` keys, as produced by the ``extract_*`` helpers.

    Returns:
        Number of chunks upserted (0 when nothing produced any text).
    """
    col = get_collection(username, notebook_id)
    ids, docs, metas = [], [], []
    for item in extracted_items:
        for ch in simple_chunk(item["text"]):
            # Number chunks across the whole source. Restarting at 0 for each
            # page/slide would give every page a "...::chunk0" id, so one
            # upsert batch would contain duplicate ids.
            ids.append(f"{source_id}::chunk{len(ids)}")
            docs.append(ch)
            meta = {"source_title": source_title, "source_id": source_id}
            # Only record a location when there is one; readers in this
            # project use .get(), so a missing key reads the same as None.
            # NOTE(review): Chroma documents metadata values as
            # str/int/float/bool - confirm None is rejected in the pinned version.
            for key in ("page", "slide"):
                if item.get(key) is not None:
                    meta[key] = item[key]
            metas.append(meta)
    if not docs:
        return 0
    embs = EMBED_MODEL.encode(docs, normalize_embeddings=True).tolist()
    col.upsert(ids=ids, documents=docs, metadatas=metas, embeddings=embs)
    return len(docs)
83
+
84
def ingest_files(username: str, notebook_id: str, filepaths: list[str]) -> int:
    """Copy uploaded files into the notebook, extract their text and index it.

    Each file is copied to ``files_raw``. Files with a ``.pdf``, ``.pptx``,
    ``.txt`` or ``.md`` extension are extracted, dumped to
    ``files_extracted/<name>.txt`` and upserted; other extensions are copied
    but not indexed.

    Returns:
        Total number of chunks added across all files.
    """
    ensure_tree(username, notebook_id)
    root = nb_root(username, notebook_id)
    raw_dir = os.path.join(root, "files_raw")
    ex_dir = os.path.join(root, "files_extracted")
    extractors = {
        ".pdf": extract_pdf,
        ".pptx": extract_pptx,
        ".txt": extract_txt,
        ".md": extract_txt,
    }
    total = 0

    for source in filepaths:
        filename = os.path.basename(source)
        dest = os.path.join(raw_dir, filename)
        pathlib.Path(dest).write_bytes(pathlib.Path(source).read_bytes())

        extractor = extractors.get(os.path.splitext(dest)[1].lower())
        if extractor is None:
            continue
        extracted = extractor(dest)

        # Keep a plain-text dump of what was extracted, one section per item.
        ex_path = os.path.join(ex_dir, filename + ".txt")
        with open(ex_path, "w", encoding="utf-8") as f:
            for item in extracted:
                if item.get("page"):
                    loc = f"page={item.get('page')}"
                elif item.get("slide"):
                    loc = f"slide={item.get('slide')}"
                else:
                    loc = ""
                f.write(f"\n--- {loc} ---\n{item['text']}\n")

        total += upsert_extracted(username, notebook_id, filename, f"file:{filename}", extracted)

    return total
114
+
115
def ingest_url(username: str, notebook_id: str, url: str) -> int:
    """Fetch *url*, save its text under ``files_extracted`` and index it.

    Returns:
        Number of chunks added for the page.
    """
    ensure_tree(username, notebook_id)
    extracted = extract_url(url)
    # Build a filesystem-friendly file name from the URL without its scheme.
    bare = url.replace("https://","").replace("http://","").replace("/","_")
    fname = safe_name(bare) + ".txt"
    ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")
    target = os.path.join(ex_dir, fname)
    with open(target, "w", encoding="utf-8") as out:
        out.write(extracted[0]["text"])
    return upsert_extracted(username, notebook_id, url, f"url:{url}", extracted)
src/backend/llm.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from huggingface_hub import InferenceClient
4
+
5
+ HF_INFERENCE_TOKEN = os.environ.get("HF_INFERENCE_TOKEN","").strip()
6
+ HF_LLM_MODEL = os.environ.get("HF_LLM_MODEL","HuggingFaceH4/zephyr-7b-beta").strip()
7
+
8
+ _client = InferenceClient(model=HF_LLM_MODEL, token=HF_INFERENCE_TOKEN) if HF_INFERENCE_TOKEN else None
9
+
10
def llm_generate(prompt: str, max_new_tokens=450, temperature=0.2) -> str:
    """Run *prompt* through the module-level inference client.

    Sampling is enabled only when ``temperature`` is above zero, and the
    prompt is not echoed back (``return_full_text=False``).

    Returns:
        The generated text with surrounding whitespace stripped ("" if the
        client returned nothing).

    Raises:
        gr.Error: when no client was created because the token is unset.
    """
    if _client is None:
        raise gr.Error("HF_INFERENCE_TOKEN not set. Add it in Space secrets.")
    generation_options = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "do_sample": temperature > 0,
        "return_full_text": False,
    }
    generated = _client.text_generation(prompt, **generation_options)
    return (generated or "").strip()
src/backend/notebooks.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uuid
2
+ import gradio as gr
3
+ from src.storage.index_store import load_index, save_index, list_notebooks
4
+ from src.storage.paths import ensure_tree
5
+ from src.utils.text import safe_name
6
+ from datetime import datetime
7
+
8
def now_iso():
    """Return the current UTC time as an ISO-8601 string ending in "Z"."""
    # Local import: the module only imports `datetime` at the top.
    from datetime import timezone

    # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
    # tzinfo so isoformat() keeps the same "...Z" output format as before.
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
10
+
11
def create_notebook(username: str, name: str) -> str:
    """Register a new notebook for *username* and create its directory tree.

    Args:
        username: Owner of the notebook.
        name: Requested display name; passed through ``safe_name`` first.

    Returns:
        The generated notebook id (a UUID4 string).
    """
    name = safe_name(name)
    idx = load_index(username)
    nb_id = str(uuid.uuid4())
    # One clock read so created_at and updated_at are identical on creation.
    stamp = now_iso()
    # setdefault tolerates an index that lacks the "notebooks" key, matching
    # the .get("notebooks", []) reads used by the other functions here.
    idx.setdefault("notebooks", []).append({
        "id": nb_id,
        "name": name,
        "created_at": stamp,
        "updated_at": stamp,
    })
    save_index(username, idx)
    ensure_tree(username, nb_id)
    return nb_id
24
+
25
def rename_notebook(username: str, notebook_id: str, new_name: str):
    """Rename one of *username*'s notebooks and bump its ``updated_at``.

    Args:
        username: Owner of the notebook.
        notebook_id: Id of the notebook to rename.
        new_name: Requested name; sanitised with ``safe_name`` before storing.

    Raises:
        gr.Error: if the requested name is empty/whitespace, or no notebook
            with ``notebook_id`` exists.
    """
    # Check the raw input: safe_name() substitutes "Untitled" for an empty
    # name, so testing its result could never detect an empty request.
    if not (new_name or "").strip():
        raise gr.Error("Notebook name cannot be empty.")
    new_name = safe_name(new_name)

    idx = load_index(username)
    target = next(
        (nb for nb in idx.get("notebooks", []) if nb["id"] == notebook_id),
        None,
    )
    if target is None:
        raise gr.Error("Notebook not found.")
    target["name"] = new_name
    target["updated_at"] = now_iso()
    save_index(username, idx)
40
+
41
def delete_notebook(username: str, notebook_id: str):
    """Drop a notebook from the user's index and remove its directory tree.

    Removal of the directory is best-effort (``ignore_errors=True``).
    """
    import shutil, os
    from src.storage.paths import nb_root

    index = load_index(username)
    remaining = []
    for entry in index.get("notebooks", []):
        if entry["id"] != notebook_id:
            remaining.append(entry)
    index["notebooks"] = remaining
    save_index(username, index)

    notebook_dir = nb_root(username, notebook_id)
    if not os.path.exists(notebook_dir):
        return
    shutil.rmtree(notebook_dir, ignore_errors=True)
src/backend/rag.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from src.storage.chroma_store import get_collection
3
+ from src.backend.llm import llm_generate
4
+
5
+ EMBED_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
6
+
7
def retrieve(username: str, notebook_id: str, query: str, k=6):
    """Return up to *k* chunks from the notebook's collection nearest to *query*.

    Args:
        username: Owner of the notebook.
        notebook_id: Notebook whose collection is searched.
        query: Free-text query; embedded with ``EMBED_MODEL``.
        k: Maximum number of results requested.

    Returns:
        A list of ``{"id", "doc", "meta"}`` dicts in the order returned by the
        collection query.
    """
    col = get_collection(username, notebook_id)
    qemb = EMBED_MODEL.encode([query], normalize_embeddings=True).tolist()
    # ids are always part of a Chroma query result; "ids" is not an accepted
    # `include` item, so only request documents and metadatas.
    res = col.query(query_embeddings=qemb, n_results=k, include=["documents", "metadatas"])
    # One query embedding was sent, so each result field holds one inner list.
    return [
        # Normalise a missing metadata entry to {} so callers can use .get().
        {"id": chunk_id, "doc": doc, "meta": meta or {}}
        for chunk_id, doc, meta in zip(res["ids"][0], res["documents"][0], res["metadatas"][0])
    ]
15
+
16
def format_sources(hits):
    """Render *hits* as numbered source lines: ``[S<n>] <title> <location>``.

    The location is ``slide N`` when the metadata has a slide, otherwise
    ``p.N`` when it has a page, otherwise omitted. Numbering starts at 1.
    """
    def describe(number, hit):
        meta = hit["meta"]
        if meta.get("slide"):
            where = f"slide {meta['slide']}"
        elif meta.get("page"):
            where = f"p.{meta['page']}"
        else:
            where = ""
        return f"[S{number}] {meta.get('source_title','source')} {where}".strip()

    return "\n".join(describe(n, hit) for n, hit in enumerate(hits, start=1))
25
+
26
def context_block(hits):
    """Render *hits* as labelled excerpts separated by ``---`` rules.

    Each excerpt is a ``[S<n>] <title> <location>`` header line followed by
    the chunk text; the location is ``(slide N)``, else ``(page N)``, else empty.
    """
    rendered = []
    for number, hit in enumerate(hits, start=1):
        meta = hit["meta"]
        if meta.get("slide"):
            where = f"(slide {meta['slide']})"
        elif meta.get("page"):
            where = f"(page {meta['page']})"
        else:
            where = ""
        header = f"[S{number}] {meta.get('source_title','source')} {where}"
        rendered.append(f"{header}\n{hit['doc']}")
    return "\n\n---\n\n".join(rendered)
35
+
36
def rag_answer(query: str, hits):
    """Answer *query* with the LLM using only the retrieved *hits*.

    Args:
        query: The user's question; interpolated into the prompt.
        hits: Retrieved chunks, passed to ``format_sources`` and
            ``context_block``.

    Returns:
        A fixed "not found" message when *hits* is empty (no LLM call is
        made); otherwise the LLM answer followed by a "Sources:" list.
    """
    if not hits:
        return "Not found in the provided sources. (No indexed chunks yet.)"
    # NOTE(review): the prompt body is kept flush-left exactly as it appears
    # in the reviewed source; confirm against the original file's whitespace.
    prompt = f"""
You are a research assistant. Answer ONLY using the sources below.
Every non-trivial claim must end with citations like [S1] or [S2].
If not present in sources, say: Not found in the provided sources.

Question:
{query}

Sources list:
{format_sources(hits)}

Source excerpts:
{context_block(hits)}

Answer with citations:
"""
    ans = llm_generate(prompt, max_new_tokens=450, temperature=0.2)
    # Append the numbered source list so the [S<n>] tags can be resolved.
    return f"{ans}\n\nSources:\n{format_sources(hits)}"
src/frontend/__pycache__/callbacks.cpython-310.pyc ADDED
Binary file (4.91 kB). View file
 
src/frontend/__pycache__/ui.cpython-310.pyc ADDED
Binary file (3.19 kB). View file
 
src/frontend/callbacks.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from datetime import datetime
3
+ import gradio as gr
4
+
5
+ from src.backend.notebooks import create_notebook, rename_notebook, delete_notebook
6
+ from src.storage.index_store import list_notebooks
7
+ from src.storage.paths import ensure_tree
8
+ from src.storage.chat_store import append_chat, load_chat
9
+ from src.storage.artifact_store import list_artifacts as list_artifacts_store, next_artifact_path
10
+ from src.backend.ingest import ingest_files as ingest_files_backend, ingest_url as ingest_url_backend
11
+ from src.backend.rag import retrieve, rag_answer
12
+ from src.backend.artifacts import generate_report, generate_quiz, generate_podcast_transcript, transcript_to_mp3
13
+
14
def now_iso():
    """Return the current UTC time as an ISO-8601 string ending in "Z"."""
    # Local import: the module only imports `datetime` at the top.
    from datetime import timezone

    # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
    # tzinfo so isoformat() keeps the same "...Z" output format as before.
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
16
+
17
def chat_pairs(history):
    """Convert stored role/content messages into ``(user, assistant)`` tuples.

    Each assistant message is paired with the most recent unpaired user
    message ("" when there is none). A user message followed by another user
    message is discarded; messages with any other role are ignored.
    """
    paired = []
    pending = None
    for message in history:
        role = message.get("role")
        if role == "user":
            pending = message.get("content","")
            continue
        if role != "assistant":
            continue
        paired.append((pending or "", message.get("content","")))
        pending = None
    return paired
27
+
28
def ui_bootstrap(username: str):
    """Build the initial UI state for *username*.

    Creates "My First Notebook" when the user has none, otherwise selects the
    first notebook returned by ``list_notebooks``.

    Returns:
        A 3-tuple: notebook dropdown update, chat history as
        ``(user, assistant)`` pairs, and artifacts dropdown update.
    """
    nbs = list_notebooks(username)
    if nbs:
        current = nbs[0][1]
    else:
        current = create_notebook(username, "My First Notebook")
        nbs = list_notebooks(username)
    ensure_tree(username, current)
    history = load_chat(username, current)
    # Returning a bare list to a Dropdown output sets its *value*, not its
    # choices, so send an explicit update that replaces the choices.
    artifacts = gr.Dropdown(choices=list_artifacts_store(username, current), value=None)
    return gr.Dropdown(choices=nbs, value=current), chat_pairs(history), artifacts
39
+
40
def on_switch_notebook(username: str, notebook_id: str):
    """Load chat history and artifact choices for the newly selected notebook.

    Returns:
        A 2-tuple: chat history as ``(user, assistant)`` pairs, and an
        artifacts dropdown update.
    """
    # The dropdown can report an empty selection; there is then no notebook
    # directory to build a path for, so clear both outputs.
    if not notebook_id:
        return [], gr.Dropdown(choices=[], value=None)
    ensure_tree(username, notebook_id)
    history = load_chat(username, notebook_id)
    # A bare list would be taken as the Dropdown's value, not its choices.
    artifacts = gr.Dropdown(choices=list_artifacts_store(username, notebook_id), value=None)
    return chat_pairs(history), artifacts
44
+
45
def on_create_notebook(username: str, name: str):
    """Create a notebook and make it the current selection.

    Returns:
        A 3-tuple: notebook dropdown update selecting the new notebook, an
        empty chat history, and an artifacts dropdown update.
    """
    nb_id = create_notebook(username, name)
    nbs = list_notebooks(username)
    # A bare list would be taken as the Dropdown's value, not its choices.
    artifacts = gr.Dropdown(choices=list_artifacts_store(username, nb_id), value=None)
    return gr.Dropdown(choices=nbs, value=nb_id), [], artifacts
49
+
50
def on_rename_notebook(username: str, notebook_id: str, new_name: str):
    """Rename the selected notebook and refresh the notebook dropdown."""
    rename_notebook(username, notebook_id, new_name)
    refreshed = list_notebooks(username)
    return gr.Dropdown(choices=refreshed, value=notebook_id)
53
+
54
def on_delete_notebook(username: str, notebook_id: str):
    """Delete the selected notebook, then rebuild the UI state from scratch."""
    delete_notebook(username, notebook_id)
    # ui_bootstrap picks (or creates) the notebook to show next.
    fresh_state = ui_bootstrap(username)
    return fresh_state
57
+
58
def on_ingest_files(username: str, notebook_id: str, files):
    """Ingest the uploaded files into the notebook and report the chunk count.

    Raises:
        gr.Error: when no files were provided.
    """
    if files:
        added = ingest_files_backend(username, notebook_id, files)
        return f"Ingested files. Added {added} chunks."
    raise gr.Error("Upload at least one file.")
63
+
64
def on_ingest_url(username: str, notebook_id: str, url: str):
    """Ingest a web page into the notebook and report the chunk count.

    Raises:
        gr.Error: when the URL is empty after stripping whitespace.
    """
    cleaned = (url or "").strip()
    if cleaned:
        added = ingest_url_backend(username, notebook_id, cleaned)
        return f"Ingested URL. Added {added} chunks."
    raise gr.Error("Enter a URL.")
70
+
71
def on_chat(username: str, notebook_id: str, chatbot, msg: str):
    """Answer one chat message with RAG and persist both sides of the turn.

    Returns:
        A 2-tuple: the chat history with the new ``(question, answer)`` pair
        appended, and "" to clear the message box. A blank message returns the
        history unchanged.
    """
    question = (msg or "").strip()
    if not question:
        return chatbot, ""
    started = time.time()
    append_chat(username, notebook_id, {"role":"user","content":question,"ts":now_iso()})
    answer = rag_answer(question, retrieve(username, notebook_id, question, k=6))
    # latency_ms covers retrieval plus generation for this turn.
    append_chat(username, notebook_id, {"role":"assistant","content":answer,"ts":now_iso(),"latency_ms":int((time.time()-started)*1000)})
    return chatbot + [(question, answer)], ""
82
+
83
def on_report(username: str, notebook_id: str, topic: str, extra: str):
    """Generate a report artifact for *topic* and save it as markdown.

    Returns:
        A 3-tuple: status text, artifacts dropdown update, and the path of
        the written report.

    Raises:
        gr.Error: when the topic is empty or retrieval finds no chunks.
    """
    topic = (topic or "").strip()
    if not topic:
        raise gr.Error("Enter a topic.")
    hits = retrieve(username, notebook_id, topic, k=6)
    if not hits:
        raise gr.Error("No sources yet. Ingest first.")
    md = generate_report(topic, hits, extra)
    out = next_artifact_path(username, notebook_id, "reports", ".md")
    # Context manager so the handle is flushed and closed before the path is
    # handed back to the UI.
    with open(out, "w", encoding="utf-8") as f:
        f.write(md)
    # A bare list would be taken as the Dropdown's value, not its choices.
    artifacts = gr.Dropdown(choices=list_artifacts_store(username, notebook_id))
    return "Report generated.", artifacts, out
94
+
95
def on_quiz(username: str, notebook_id: str, topic: str, extra: str):
    """Generate a quiz artifact for *topic* and save it as markdown.

    Returns:
        A 3-tuple: status text, artifacts dropdown update, and the path of
        the written quiz.

    Raises:
        gr.Error: when the topic is empty or retrieval finds no chunks.
    """
    topic = (topic or "").strip()
    if not topic:
        raise gr.Error("Enter a topic.")
    hits = retrieve(username, notebook_id, topic, k=6)
    if not hits:
        raise gr.Error("No sources yet. Ingest first.")
    md = generate_quiz(topic, hits, extra)
    out = next_artifact_path(username, notebook_id, "quizzes", ".md")
    # Context manager so the handle is flushed and closed before the path is
    # handed back to the UI.
    with open(out, "w", encoding="utf-8") as f:
        f.write(md)
    # A bare list would be taken as the Dropdown's value, not its choices.
    artifacts = gr.Dropdown(choices=list_artifacts_store(username, notebook_id))
    return "Quiz generated.", artifacts, out
106
+
107
def on_podcast(username: str, notebook_id: str, topic: str, extra: str):
    """Generate a podcast transcript for *topic* plus its MP3 rendering.

    Returns:
        A 4-tuple: status text, artifacts dropdown update, transcript path,
        and MP3 path.

    Raises:
        gr.Error: when the topic is empty or retrieval finds no chunks.
    """
    topic = (topic or "").strip()
    if not topic:
        raise gr.Error("Enter a topic.")
    hits = retrieve(username, notebook_id, topic, k=6)
    if not hits:
        raise gr.Error("No sources yet. Ingest first.")
    md = generate_podcast_transcript(topic, hits, extra)
    md_path = next_artifact_path(username, notebook_id, "podcasts", ".md")
    # Context manager so the transcript is flushed and closed before the
    # audio step and before the path is handed back to the UI.
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md)

    mp3_path = next_artifact_path(username, notebook_id, "podcasts", ".mp3")
    transcript_to_mp3(md, mp3_path)

    # A bare list would be taken as the Dropdown's value, not its choices.
    artifacts = gr.Dropdown(choices=list_artifacts_store(username, notebook_id))
    return "Podcast generated.", artifacts, md_path, mp3_path
122
+
123
def on_download(username: str, notebook_id: str, selection: str):
    """Resolve the selected artifact to a file path for the download widget.

    Returns:
        The path under the notebook's ``artifacts`` directory, or ``None``
        when nothing is selected or the file does not exist.
    """
    import os
    from src.storage.paths import nb_root

    if selection:
        candidate = os.path.join(nb_root(username, notebook_id), "artifacts", selection)
        if os.path.exists(candidate):
            return candidate
    return None
src/frontend/ui.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from src.frontend.callbacks import (
3
+ ui_bootstrap, on_switch_notebook, on_create_notebook, on_rename_notebook, on_delete_notebook,
4
+ on_ingest_files, on_ingest_url, on_chat, on_report, on_quiz, on_podcast, on_download
5
+ )
6
+ from src.backend.auth import require_login
7
+
8
def build_app():
    """Assemble the Gradio Blocks UI and wire every widget to its callback.

    Returns:
        The ``gr.Blocks`` instance. It is not launched here; the caller
        decides when to call ``launch()``.
    """
    # NOTE(review): the nesting below is reconstructed from the statement
    # order; the reviewed source had lost its indentation - confirm layout.
    with gr.Blocks(title="NotebookLM Clone") as demo:
        gr.Markdown("# 📓 NotebookLM Clone (HF Auth + Chroma + RAG)")

        login = gr.LoginButton()
        # Per-session username; filled by on_load and passed to every callback.
        username_state = gr.State("")

        def on_load(request: gr.Request):
            # require_login raises gr.Error when the request has no username,
            # in which case none of the outputs below are populated.
            username = require_login(request)
            dd, chat, arts = ui_bootstrap(username)
            return username, dd, chat, arts

        with gr.Row():
            # Left column: notebook management, ingestion and artifacts.
            with gr.Column(scale=1):
                user_box = gr.Textbox(label="User", interactive=False)
                notebook_dd = gr.Dropdown(label="Notebooks", choices=[], interactive=True)

                nb_new = gr.Textbox(label="Create notebook", placeholder="Name")
                btn_create = gr.Button("Create")

                nb_rename = gr.Textbox(label="Rename notebook", placeholder="New name")
                btn_rename = gr.Button("Rename")

                btn_delete = gr.Button("Delete current", variant="stop")

                gr.Markdown("## Ingest")
                file_up = gr.File(label="Upload PDF/PPTX/TXT", file_count="multiple")
                btn_ingest_files = gr.Button("Ingest Files")
                ingest_status = gr.Textbox(label="Status", interactive=False)

                url_in = gr.Textbox(label="URL", placeholder="https://...")
                btn_ingest_url = gr.Button("Ingest URL")
                url_status = gr.Textbox(label="Status", interactive=False)

                gr.Markdown("## Artifacts")
                topic = gr.Textbox(label="Topic / prompt")
                extra = gr.Textbox(label="Extra prompt (optional)")
                btn_report = gr.Button("Generate Report")
                btn_quiz = gr.Button("Generate Quiz")
                btn_podcast = gr.Button("Generate Podcast")

                artifact_status = gr.Textbox(label="Artifact status", interactive=False)
                artifacts_list = gr.Dropdown(label="Artifacts", choices=[], interactive=True)
                download_btn = gr.Button("Download selected")
                download_file = gr.File(label="Download", interactive=False)
                podcast_audio = gr.Audio(label="Podcast Audio", interactive=False)

            # Right column: the RAG chat.
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(height=520, label="Chat (RAG + citations)")
                msg = gr.Textbox(label="Message")
                send = gr.Button("Send")

        # Page load: authenticate and populate the initial state.
        demo.load(on_load, inputs=None, outputs=[username_state, notebook_dd, chatbot, artifacts_list], queue=False)
        # Mirror the stored username into the read-only "User" box.
        username_state.change(lambda u: u, inputs=username_state, outputs=user_box, queue=False)

        notebook_dd.change(on_switch_notebook, inputs=[username_state, notebook_dd], outputs=[chatbot, artifacts_list], queue=False)

        # Notebook management runs unqueued.
        btn_create.click(on_create_notebook, inputs=[username_state, nb_new], outputs=[notebook_dd, chatbot, artifacts_list], queue=False)
        btn_rename.click(on_rename_notebook, inputs=[username_state, notebook_dd, nb_rename], outputs=[notebook_dd], queue=False)
        btn_delete.click(on_delete_notebook, inputs=[username_state, notebook_dd], outputs=[notebook_dd, chatbot, artifacts_list], queue=False)

        # Ingestion, chat and artifact generation go through the queue.
        btn_ingest_files.click(on_ingest_files, inputs=[username_state, notebook_dd, file_up], outputs=[ingest_status], queue=True)
        btn_ingest_url.click(on_ingest_url, inputs=[username_state, notebook_dd, url_in], outputs=[url_status], queue=True)

        send.click(on_chat, inputs=[username_state, notebook_dd, chatbot, msg], outputs=[chatbot, msg], queue=True)

        btn_report.click(on_report, inputs=[username_state, notebook_dd, topic, extra], outputs=[artifact_status, artifacts_list, download_file], queue=True)
        btn_quiz.click(on_quiz, inputs=[username_state, notebook_dd, topic, extra], outputs=[artifact_status, artifacts_list, download_file], queue=True)
        # The podcast handler returns one extra value: the MP3 path for the audio player.
        btn_podcast.click(on_podcast, inputs=[username_state, notebook_dd, topic, extra], outputs=[artifact_status, artifacts_list, download_file, podcast_audio], queue=True)

        download_btn.click(on_download, inputs=[username_state, notebook_dd, artifacts_list], outputs=[download_file], queue=False)

    return demo
src/storage/__pycache__/artifact_store.cpython-310.pyc ADDED
Binary file (1.18 kB). View file
 
src/storage/__pycache__/chat_store.cpython-310.pyc ADDED
Binary file (1.17 kB). View file
 
src/storage/__pycache__/chroma_store.cpython-310.pyc ADDED
Binary file (830 Bytes). View file
 
src/storage/__pycache__/index_store.cpython-310.pyc ADDED
Binary file (1.73 kB). View file
 
src/storage/__pycache__/paths.cpython-310.pyc ADDED
Binary file (1.26 kB). View file
 
src/storage/artifact_store.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from src.storage.paths import nb_root
3
+
4
def list_artifacts(username: str, notebook_id: str):
    """List the notebook's artifacts as ``"<kind>/<filename>"`` strings.

    Kinds are visited in the order reports, quizzes, podcasts; file names are
    sorted within each kind. Kinds whose directory does not exist are skipped.
    """
    root = os.path.join(nb_root(username, notebook_id), "artifacts")
    listing = []
    for kind in ("reports","quizzes","podcasts"):
        folder = os.path.join(root, kind)
        if os.path.exists(folder):
            listing.extend(f"{kind}/{name}" for name in sorted(os.listdir(folder)))
    return listing
14
+
15
def next_artifact_path(username: str, notebook_id: str, kind: str, ext: str):
    """Return an unused ``<prefix>_<n><ext>`` path for a new artifact.

    Args:
        username: Owner of the notebook.
        notebook_id: Notebook the artifact belongs to.
        kind: One of "reports", "quizzes" or "podcasts".
        ext: File extension including the dot, e.g. ".md".

    Returns:
        A path inside the notebook's ``artifacts/<kind>`` directory (created
        if missing) that does not exist yet.

    Raises:
        KeyError: if *kind* is not one of the known artifact kinds.
    """
    prefix = {"reports":"report","quizzes":"quiz","podcasts":"podcast"}[kind]
    base = os.path.join(nb_root(username, notebook_id), "artifacts", kind)
    os.makedirs(base, exist_ok=True)
    # Start from the count-based number, then skip forward past names already
    # taken: after a deletion, count + 1 can equal the number of a file that
    # still exists, and returning that path would overwrite it.
    n = sum(1 for p in os.listdir(base) if p.endswith(ext)) + 1
    candidate = os.path.join(base, f"{prefix}_{n}{ext}")
    while os.path.exists(candidate):
        n += 1
        candidate = os.path.join(base, f"{prefix}_{n}{ext}")
    return candidate
src/storage/chat_store.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json
2
+ from src.storage.paths import nb_root
3
+
4
def chat_path(username: str, notebook_id: str) -> str:
    """Return the path of the notebook's JSON-lines chat log."""
    notebook_dir = nb_root(username, notebook_id)
    return os.path.join(notebook_dir, "chat", "messages.jsonl")
6
+
7
def append_chat(username: str, notebook_id: str, obj: dict):
    """Append *obj* as one JSON line to the notebook's chat log.

    The log's directory is created if needed; non-ASCII text is written as-is.
    """
    target = chat_path(username, notebook_id)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, "a", encoding="utf-8") as log:
        log.write(f"{json.dumps(obj, ensure_ascii=False)}\n")
12
+
13
def load_chat(username: str, notebook_id: str):
    """Load every parseable message from the notebook's chat log.

    Returns:
        A list of decoded JSON objects in file order; ``[]`` when the log
        does not exist. Blank or malformed lines are skipped.
    """
    p = chat_path(username, notebook_id)
    if not os.path.exists(p):
        return []
    out = []
    with open(p, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                out.append(json.loads(line))
            except ValueError:
                # json.JSONDecodeError is a ValueError. Skip a corrupt line
                # (best-effort load) without also swallowing unrelated
                # exceptions the way a bare `except:` would.
                continue
    return out
src/storage/chroma_store.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import chromadb
3
+ from chromadb.config import Settings
4
+ from src.storage.paths import nb_root
5
+
6
def chroma_client(username: str, notebook_id: str):
    """Return a persistent Chroma client rooted at the notebook's ``chroma`` dir.

    The directory is created if missing; anonymized telemetry is disabled.
    """
    storage_dir = os.path.join(nb_root(username, notebook_id), "chroma")
    os.makedirs(storage_dir, exist_ok=True)
    client_settings = Settings(anonymized_telemetry=False)
    return chromadb.PersistentClient(path=storage_dir, settings=client_settings)
10
+
11
def get_collection(username: str, notebook_id: str):
    """Return the notebook's "docs" collection, creating it on first use."""
    return chroma_client(username, notebook_id).get_or_create_collection(name="docs")
src/storage/index_store.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json
2
+ from datetime import datetime
3
+ from .paths import user_root, index_path, ensure_tree
4
+
5
def now_iso():
    """Return the current UTC time as an ISO-8601 string ending in "Z"."""
    # Local import: the module only imports `datetime` at the top.
    from datetime import timezone

    # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
    # tzinfo so isoformat() keeps the same "...Z" output format as before.
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
7
+
8
def load_index(username: str) -> dict:
    """Load the user's notebook index, creating an empty one on first use.

    The user's root directory is created if missing. A missing index file is
    initialised with ``{"notebooks": []}`` before being read back.
    """
    os.makedirs(user_root(username), exist_ok=True)
    location = index_path(username)
    if not os.path.exists(location):
        with open(location, "w", encoding="utf-8") as fresh:
            json.dump({"notebooks": []}, fresh, indent=2)
    with open(location, "r", encoding="utf-8") as stored:
        index = json.load(stored)
    return index
16
+
17
def save_index(username: str, idx: dict):
    """Overwrite the user's notebook index file with *idx* (2-space JSON)."""
    location = index_path(username)
    with open(location, "w", encoding="utf-8") as handle:
        json.dump(idx, handle, indent=2)
20
+
21
def list_notebooks(username: str):
    """Return the user's notebooks as ``(name, id)`` tuples in index order."""
    entries = load_index(username).get("notebooks", [])
    pairs = []
    for entry in entries:
        pairs.append((entry["name"], entry["id"]))
    return pairs
24
+
25
def touch_updated(username: str, notebook_id: str):
    """Set ``updated_at`` on the matching notebook and rewrite the index.

    The index is saved even when no notebook matches *notebook_id*.
    """
    index = load_index(username)
    match = next(
        (nb for nb in index.get("notebooks", []) if nb["id"] == notebook_id),
        None,
    )
    if match is not None:
        match["updated_at"] = now_iso()
    save_index(username, index)
src/storage/paths.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
+ # If DATA_ROOT env var is not set:
5
+ # - Locally: write to ./data (project folder)
6
+ # - On HF: you will set DATA_ROOT=/data in Space variables (or leave it as /data there)
7
+ DEFAULT_LOCAL_DATA = str(Path(__file__).resolve().parents[2] / "data")
8
+
9
+ DATA_ROOT = os.environ.get("DATA_ROOT", DEFAULT_LOCAL_DATA)
10
+
11
def user_root(username: str) -> str:
    """Return ``<DATA_ROOT>/users/<username>/notebooks``."""
    segments = (DATA_ROOT, "users", username, "notebooks")
    return os.path.join(*segments)
13
+
14
def index_path(username: str) -> str:
    """Return the path of the user's ``index.json`` inside their notebooks root."""
    notebooks_dir = user_root(username)
    return os.path.join(notebooks_dir, "index.json")
16
+
17
def nb_root(username: str, notebook_id: str) -> str:
    """Return the directory that holds everything for one notebook."""
    notebooks_dir = user_root(username)
    return os.path.join(notebooks_dir, notebook_id)
19
+
20
def ensure_tree(username: str, notebook_id: str):
    """Create the notebook's directory layout if any part of it is missing.

    Creates the user's notebooks root plus ``files_raw``, ``files_extracted``,
    ``chroma``, ``chat`` and ``artifacts/{reports,quizzes,podcasts}``.
    """
    base = nb_root(username, notebook_id)
    os.makedirs(user_root(username), exist_ok=True)
    subdirs = (
        ("files_raw",),
        ("files_extracted",),
        ("chroma",),
        ("chat",),
        ("artifacts", "reports"),
        ("artifacts", "quizzes"),
        ("artifacts", "podcasts"),
    )
    for parts in subdirs:
        os.makedirs(os.path.join(base, *parts), exist_ok=True)
src/utils/__pycache__/text.cpython-310.pyc ADDED
Binary file (423 Bytes). View file
 
src/utils/text.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def safe_name(s: str) -> str:
    """Reduce *s* to a short name made of letters, digits, ``_``, ``-`` and spaces.

    Other characters are removed, whitespace runs collapse to one space, and
    the result is cut to 60 characters. Returns "Untitled" when nothing is left.
    """
    kept = re.sub(r"[^a-zA-Z0-9_\- ]+", "", (s or "").strip())
    collapsed = re.sub(r"\s+", " ", kept).strip()
    if not collapsed:
        return "Untitled"
    return collapsed[:60]
src/utils/timing.py ADDED
File without changes