Spaces:
Runtime error
Runtime error
Commit ·
b4c7867
1
Parent(s): e0ae835
NotebookLM Clone
Browse files- README.md +0 -12
- app.py +6 -0
- requirements.txt +11 -0
- src/backend/__pycache__/artifacts.cpython-310.pyc +0 -0
- src/backend/__pycache__/auth.cpython-310.pyc +0 -0
- src/backend/__pycache__/ingest.cpython-310.pyc +0 -0
- src/backend/__pycache__/llm.cpython-310.pyc +0 -0
- src/backend/__pycache__/notebooks.cpython-310.pyc +0 -0
- src/backend/__pycache__/rag.cpython-310.pyc +0 -0
- src/backend/artifacts.py +82 -0
- src/backend/auth.py +7 -0
- src/backend/ingest.py +122 -0
- src/backend/llm.py +20 -0
- src/backend/notebooks.py +49 -0
- src/backend/rag.py +56 -0
- src/frontend/__pycache__/callbacks.cpython-310.pyc +0 -0
- src/frontend/__pycache__/ui.cpython-310.pyc +0 -0
- src/frontend/callbacks.py +129 -0
- src/frontend/ui.py +80 -0
- src/storage/__pycache__/artifact_store.cpython-310.pyc +0 -0
- src/storage/__pycache__/chat_store.cpython-310.pyc +0 -0
- src/storage/__pycache__/chroma_store.cpython-310.pyc +0 -0
- src/storage/__pycache__/index_store.cpython-310.pyc +0 -0
- src/storage/__pycache__/paths.cpython-310.pyc +0 -0
- src/storage/artifact_store.py +21 -0
- src/storage/chat_store.py +22 -0
- src/storage/chroma_store.py +13 -0
- src/storage/index_store.py +31 -0
- src/storage/paths.py +33 -0
- src/utils/__pycache__/text.cpython-310.pyc +0 -0
- src/utils/text.py +7 -0
- src/utils/timing.py +0 -0
README.md
CHANGED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: GPP1
|
| 3 |
-
emoji: 🏆
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: green
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 6.8.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.frontend.ui import build_app
|
| 2 |
+
|
| 3 |
+
# Module-level app object, built as soon as this module is imported.
demo = build_app()

# Start the server only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio[oauth]==4.44.1
|
| 2 |
+
chromadb==0.5.5
|
| 3 |
+
sentence-transformers==3.0.1
|
| 4 |
+
pypdf==4.3.1
|
| 5 |
+
python-pptx==1.0.2
|
| 6 |
+
beautifulsoup4==4.12.3
|
| 7 |
+
requests==2.32.3
|
| 8 |
+
gTTS==2.5.3
|
| 9 |
+
huggingface_hub==0.24.6
|
| 10 |
+
|
| 11 |
+
# Hugging Face token: do not store it in this file; set HF_INFERENCE_TOKEN as a Space secret instead.
|
src/backend/__pycache__/artifacts.cpython-310.pyc
ADDED
|
Binary file (2.12 kB). View file
|
|
|
src/backend/__pycache__/auth.cpython-310.pyc
ADDED
|
Binary file (461 Bytes). View file
|
|
|
src/backend/__pycache__/ingest.cpython-310.pyc
ADDED
|
Binary file (4.94 kB). View file
|
|
|
src/backend/__pycache__/llm.cpython-310.pyc
ADDED
|
Binary file (841 Bytes). View file
|
|
|
src/backend/__pycache__/notebooks.cpython-310.pyc
ADDED
|
Binary file (1.9 kB). View file
|
|
|
src/backend/__pycache__/rag.cpython-310.pyc
ADDED
|
Binary file (2.36 kB). View file
|
|
|
src/backend/artifacts.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from gtts import gTTS
|
| 3 |
+
from src.backend.llm import llm_generate
|
| 4 |
+
from src.backend.rag import format_sources, context_block
|
| 5 |
+
|
| 6 |
+
def generate_report(topic: str, hits, extra_prompt: str):
    """Ask the LLM for a cited markdown study report about ``topic``.

    Args:
        topic: Subject of the report, inserted verbatim into the prompt.
        hits: Retrieved chunks, passed to ``format_sources`` and
            ``context_block`` to build the source list and excerpts.
        extra_prompt: Optional extra instructions; "(none)" is used when falsy.

    Returns:
        Whatever ``llm_generate`` returns for the assembled prompt.
    """
    extra_instructions = extra_prompt or "(none)"
    source_list = format_sources(hits)
    excerpts = context_block(hits)
    prompt = f"""
Write a markdown study report grounded ONLY in the sources.
Every non-trivial claim must include citations like [S1].

Topic: {topic}
Extra instructions: {extra_instructions}

Sources list:
{source_list}

Excerpts:
{excerpts}

Output:
# Report
## Key Concepts
## Detailed Notes
## Key Takeaways
"""
    return llm_generate(prompt, max_new_tokens=900, temperature=0.25)
|
| 27 |
+
|
| 28 |
+
def generate_quiz(topic: str, hits, extra_prompt: str):
    """Ask the LLM for a cited markdown quiz (with answer key) about ``topic``.

    Args:
        topic: Subject of the quiz, inserted verbatim into the prompt.
        hits: Retrieved chunks, passed to ``format_sources`` and
            ``context_block`` to build the source list and excerpts.
        extra_prompt: Optional extra instructions; "(none)" is used when falsy.

    Returns:
        Whatever ``llm_generate`` returns for the assembled prompt.
    """
    extra_instructions = extra_prompt or "(none)"
    source_list = format_sources(hits)
    excerpts = context_block(hits)
    prompt = f"""
Write a markdown quiz grounded ONLY in the sources.
Create 8 questions:
- 5 multiple choice
- 3 short answer
Then include an Answer Key with explanations.
Explanations must include citations like [S1].

Topic: {topic}
Extra instructions: {extra_instructions}

Sources list:
{source_list}

Excerpts:
{excerpts}

Output:
# Quiz
## Questions
## Answer Key
"""
    return llm_generate(prompt, max_new_tokens=900, temperature=0.25)
|
| 52 |
+
|
| 53 |
+
def generate_podcast_transcript(topic: str, hits, extra_prompt: str):
    """Ask the LLM for a cited two-speaker podcast transcript about ``topic``.

    Args:
        topic: Subject of the episode, inserted verbatim into the prompt.
        hits: Retrieved chunks, passed to ``format_sources`` and
            ``context_block`` to build the source list and excerpts.
        extra_prompt: Optional extra instructions; "(none)" is used when falsy.

    Returns:
        Whatever ``llm_generate`` returns for the assembled prompt.
    """
    extra_instructions = extra_prompt or "(none)"
    source_list = format_sources(hits)
    excerpts = context_block(hits)
    prompt = f"""
Write a markdown podcast transcript grounded ONLY in the sources.
Two speakers: Speaker 1 and Speaker 2.
Every non-trivial claim must include citations like [S1].

Topic: {topic}
Extra instructions: {extra_instructions}

Sources list:
{source_list}

Excerpts:
{excerpts}

Output:
# Podcast Transcript
**Speaker 1:** ...
**Speaker 2:** ...
End with Sources section.
"""
    return llm_generate(prompt, max_new_tokens=900, temperature=0.3)
|
| 75 |
+
|
| 76 |
+
def transcript_to_mp3(transcript_md: str, out_path: str):
    """Strip markdown/citation markup from a transcript and save it as speech.

    Args:
        transcript_md: Markdown transcript text.
        out_path: Destination path handed to ``gTTS(...).save``.
    """
    # Applied in order: citation tags, heading hashes, bold markers, then
    # whitespace runs collapsed to single spaces.
    cleanup_steps = (
        (r"\[(S\d+)\]", ""),
        (r"#+", ""),
        (r"\*\*", ""),
        (r"\s+", " "),
    )
    spoken = transcript_md
    for pattern, replacement in cleanup_steps:
        spoken = re.sub(pattern, replacement, spoken)
    # Only the first 4500 characters of the cleaned text are synthesized.
    spoken = spoken.strip()[:4500]
    gTTS(text=spoken, lang="en").save(out_path)
|
src/backend/auth.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
|
| 3 |
+
def require_login(request: gr.Request) -> str:
    """Return the username carried by ``request`` or refuse the call.

    Raises:
        gr.Error: If the request has no (or an empty) ``username`` attribute.
    """
    username = getattr(request, "username", None)
    if username:
        return username
    raise gr.Error("Please log in using 'Sign in with Hugging Face' to use this app.")
|
src/backend/ingest.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, pathlib
|
| 2 |
+
import requests
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
from pypdf import PdfReader
|
| 5 |
+
from pptx import Presentation
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
|
| 8 |
+
from src.storage.paths import nb_root, ensure_tree
|
| 9 |
+
from src.storage.chroma_store import get_collection
|
| 10 |
+
from src.utils.text import safe_name
|
| 11 |
+
|
| 12 |
+
# Sentence-embedding model, instantiated once at import time; upsert_extracted uses it to embed chunks.
EMBED_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 13 |
+
|
| 14 |
+
def simple_chunk(text: str, max_chars=2200, overlap=250):
    """Split text into overlapping fixed-size character windows.

    Lines are stripped and blank lines dropped before chunking.

    Args:
        text: Raw text; ``None`` or whitespace-only input yields ``[]``.
        max_chars: Maximum characters per chunk; must be positive.
        overlap: Characters shared between consecutive chunks. When it is
            ``max_chars`` or more, the window still advances by one character
            so the loop always terminates.

    Returns:
        List of chunk strings; a single-element list when the text fits.

    Raises:
        ValueError: If non-empty text is given with ``max_chars <= 0``.
    """
    text = "\n".join(ln.strip() for ln in (text or "").splitlines() if ln.strip())
    if not text:
        return []
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    if len(text) <= max_chars:
        return [text]

    # Guarantee forward progress: an overlap >= max_chars would otherwise
    # keep the window start fixed and loop forever.
    step = max(1, max_chars - overlap)
    out, start = [], 0
    while True:
        out.append(text[start:start + max_chars])
        if start + max_chars >= len(text):
            return out
        start += step
|
| 27 |
+
|
| 28 |
+
def extract_pdf(path: str):
    """Extract per-page text from a PDF.

    Returns:
        A list of ``{"text", "page"}`` dicts with 1-based page numbers; pages
        whose extracted text is empty are omitted.
    """
    items = []
    for number, page in enumerate(PdfReader(path).pages, start=1):
        page_text = (page.extract_text() or "").strip()
        if not page_text:
            continue
        items.append({"text": page_text, "page": number})
    return items
|
| 36 |
+
|
| 37 |
+
def extract_pptx(path: str):
    """Extract per-slide text from a PowerPoint file.

    Returns:
        A list of ``{"text", "slide"}`` dicts with 1-based slide numbers;
        slides without any text are omitted.
    """
    items = []
    for number, slide in enumerate(Presentation(path).slides, start=1):
        # Only shapes that expose a non-empty ``text`` attribute contribute.
        shape_texts = [
            shape.text
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text
        ]
        stripped = (piece.strip() for piece in shape_texts)
        slide_text = "\n".join(piece for piece in stripped if piece).strip()
        if slide_text:
            items.append({"text": slide_text, "slide": number})
    return items
|
| 49 |
+
|
| 50 |
+
def extract_txt(path: str):
    """Read a UTF-8 text file, ignoring undecodable bytes.

    Returns:
        ``[{"text": <stripped content>, "page": None}]``, or ``[]`` when the
        file is empty after stripping.
    """
    content = pathlib.Path(path).read_text(encoding="utf-8", errors="ignore").strip()
    if not content:
        return []
    return [{"text": content, "page": None}]
|
| 54 |
+
|
| 55 |
+
def extract_url(url: str):
    """Download a web page and return its visible text.

    Script, style and noscript elements are removed; the remaining text is
    reduced to its non-empty stripped lines and capped at 200000 characters.

    Returns:
        A single-element list ``[{"text", "page": None}]`` (the text may be
        empty).

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status`` on a bad
            HTTP status.
    """
    response = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    stripped_lines = (line.strip() for line in soup.get_text("\n").splitlines())
    page_text = "\n".join(line for line in stripped_lines if line)
    return [{"text": page_text[:200000], "page": None}]
|
| 64 |
+
|
| 65 |
+
def upsert_extracted(username: str, notebook_id: str, source_title: str, source_id: str, extracted_items: list[dict]) -> int:
    """Chunk, embed and upsert extracted text into the notebook's collection.

    Args:
        username: Owner of the notebook.
        notebook_id: Notebook whose collection receives the chunks.
        source_title: Title stored in each chunk's metadata.
        source_id: Stable source identifier; also the prefix of chunk ids.
        extracted_items: Dicts with a ``"text"`` key and optional ``"page"``
            or ``"slide"`` keys.

    Returns:
        Number of chunks written (0 when nothing was extractable).
    """
    col = get_collection(username, notebook_id)
    ids, docs, metas = [], [], []
    for item in extracted_items:
        base_meta = {"source_title": source_title, "source_id": source_id}
        # Only attach location keys that carry a value; None metadata values
        # are not among the types Chroma documents as valid.
        for key in ("page", "slide"):
            value = item.get(key)
            if value is not None:
                base_meta[key] = value
        for ch in simple_chunk(item["text"]):
            # Number chunks across the whole source, not per page/slide, so
            # ids stay unique within a single upsert call.
            ids.append(f"{source_id}::chunk{len(ids)}")
            docs.append(ch)
            metas.append(dict(base_meta))
    if not docs:
        return 0
    embs = EMBED_MODEL.encode(docs, normalize_embeddings=True).tolist()
    col.upsert(ids=ids, documents=docs, metadatas=metas, embeddings=embs)
    return len(docs)
|
| 83 |
+
|
| 84 |
+
def ingest_files(username: str, notebook_id: str, filepaths: list[str]) -> int:
    """Copy uploaded files into the notebook, extract their text and index it.

    Each file is copied to ``files_raw``. Files with a .pdf, .pptx, .txt or
    .md extension are then extracted, dumped to ``files_extracted`` and
    upserted; any other extension is skipped after the copy.

    Returns:
        Total number of chunks added across all files.
    """
    ensure_tree(username, notebook_id)
    raw_dir = os.path.join(nb_root(username, notebook_id), "files_raw")
    ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")

    # Extension -> extractor dispatch table.
    extractors = {
        ".pdf": extract_pdf,
        ".pptx": extract_pptx,
        ".txt": extract_txt,
        ".md": extract_txt,
    }

    total_added = 0
    for fp in filepaths:
        filename = os.path.basename(fp)
        dest = os.path.join(raw_dir, filename)
        pathlib.Path(dest).write_bytes(pathlib.Path(fp).read_bytes())

        extractor = extractors.get(os.path.splitext(dest)[1].lower())
        if extractor is None:
            continue
        extracted = extractor(dest)

        # save extracted
        ex_path = os.path.join(ex_dir, filename + ".txt")
        with open(ex_path, "w", encoding="utf-8") as f:
            for item in extracted:
                if item.get("page"):
                    loc = f"page={item.get('page')}"
                elif item.get("slide"):
                    loc = f"slide={item.get('slide')}"
                else:
                    loc = ""
                f.write(f"\n--- {loc} ---\n{item['text']}\n")

        total_added += upsert_extracted(
            username, notebook_id, filename, f"file:{filename}", extracted
        )

    return total_added
|
| 114 |
+
|
| 115 |
+
def ingest_url(username: str, notebook_id: str, url: str) -> int:
    """Fetch a URL, save its extracted text and index it in the notebook.

    Returns:
        Number of chunks added for the page.
    """
    ensure_tree(username, notebook_id)
    extracted = extract_url(url)

    # File name derived from the URL without its scheme, slashes replaced.
    stem = url.replace("https://", "").replace("http://", "").replace("/", "_")
    fname = safe_name(stem) + ".txt"
    ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")
    with open(os.path.join(ex_dir, fname), "w", encoding="utf-8") as f:
        f.write(extracted[0]["text"])

    return upsert_extracted(username, notebook_id, url, f"url:{url}", extracted)
|
src/backend/llm.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from huggingface_hub import InferenceClient
|
| 4 |
+
|
| 5 |
+
# Inference token and model id are read from the environment at import time.
HF_INFERENCE_TOKEN = os.environ.get("HF_INFERENCE_TOKEN", "").strip()
HF_LLM_MODEL = os.environ.get("HF_LLM_MODEL", "HuggingFaceH4/zephyr-7b-beta").strip()

# No client is created without a token; llm_generate reports that case.
if HF_INFERENCE_TOKEN:
    _client = InferenceClient(model=HF_LLM_MODEL, token=HF_INFERENCE_TOKEN)
else:
    _client = None
|
| 9 |
+
|
| 10 |
+
def llm_generate(prompt: str, max_new_tokens=450, temperature=0.2) -> str:
    """Generate a completion for ``prompt`` with the configured inference client.

    Sampling is enabled only when ``temperature`` is greater than zero.

    Returns:
        The stripped completion text, or "" if the client returned nothing.

    Raises:
        gr.Error: If no client was configured (HF_INFERENCE_TOKEN unset).
    """
    if _client is None:
        raise gr.Error("HF_INFERENCE_TOKEN not set. Add it in Space secrets.")

    generation_args = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "do_sample": temperature > 0,
        "return_full_text": False,
    }
    completion = _client.text_generation(prompt, **generation_args)
    return (completion or "").strip()
|
src/backend/notebooks.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import uuid
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from src.storage.index_store import load_index, save_index, list_notebooks
|
| 4 |
+
from src.storage.paths import ensure_tree
|
| 5 |
+
from src.utils.text import safe_name
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
|
| 8 |
+
def now_iso():
    """Return the current UTC time as an ISO-8601 string with a ``Z`` suffix.

    Uses an aware ``datetime.now(timezone.utc)`` rather than the deprecated
    ``datetime.utcnow()``; tzinfo is dropped before formatting so the output
    keeps the ``YYYY-MM-DDTHH:MM:SS[.ffffff]Z`` shape.
    """
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
|
| 10 |
+
|
| 11 |
+
def create_notebook(username: str, name: str) -> str:
    """Register a new notebook for ``username`` and create its storage tree.

    Args:
        username: Owner of the notebook.
        name: Requested display name; passed through ``safe_name``.

    Returns:
        The new notebook's id (a UUID4 string).

    Raises:
        gr.Error: If the name is empty after ``safe_name`` (same rule as
            ``rename_notebook``).
    """
    name = safe_name(name)
    if not name:
        raise gr.Error("Notebook name cannot be empty.")
    idx = load_index(username)
    nb_id = str(uuid.uuid4())
    # One timestamp so created_at and updated_at match exactly on creation.
    stamp = now_iso()
    idx.setdefault("notebooks", []).append({
        "id": nb_id,
        "name": name,
        "created_at": stamp,
        "updated_at": stamp,
    })
    save_index(username, idx)
    ensure_tree(username, nb_id)
    return nb_id
|
| 24 |
+
|
| 25 |
+
def rename_notebook(username: str, notebook_id: str, new_name: str):
    """Rename one of the user's notebooks and bump its ``updated_at``.

    Raises:
        gr.Error: If the sanitized name is empty, or no notebook with
            ``notebook_id`` exists in the user's index.
    """
    new_name = safe_name(new_name)
    if not new_name:
        raise gr.Error("Notebook name cannot be empty.")

    idx = load_index(username)
    for entry in idx.get("notebooks", []):
        if entry["id"] != notebook_id:
            continue
        entry["name"] = new_name
        entry["updated_at"] = now_iso()
        break
    else:
        # The loop finished without finding a matching id.
        raise gr.Error("Notebook not found.")
    save_index(username, idx)
|
| 40 |
+
|
| 41 |
+
def delete_notebook(username: str, notebook_id: str):
    """Remove a notebook from the user's index and delete its directory.

    Directory removal is best-effort (``ignore_errors=True``).
    """
    import shutil, os
    from src.storage.paths import nb_root

    idx = load_index(username)
    remaining = []
    for entry in idx.get("notebooks", []):
        if entry["id"] != notebook_id:
            remaining.append(entry)
    idx["notebooks"] = remaining
    save_index(username, idx)

    notebook_dir = nb_root(username, notebook_id)
    if not os.path.exists(notebook_dir):
        return
    shutil.rmtree(notebook_dir, ignore_errors=True)
|
src/backend/rag.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer
|
| 2 |
+
from src.storage.chroma_store import get_collection
|
| 3 |
+
from src.backend.llm import llm_generate
|
| 4 |
+
|
| 5 |
+
# Sentence-embedding model, instantiated once at import time; retrieve uses it to embed queries.
EMBED_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 6 |
+
|
| 7 |
+
def retrieve(username: str, notebook_id: str, query: str, k=6):
    """Return up to ``k`` indexed chunks most similar to ``query``.

    Args:
        username: Owner of the notebook.
        notebook_id: Notebook whose collection is searched.
        query: Natural-language query, embedded with ``EMBED_MODEL``.
        k: Maximum number of results requested.

    Returns:
        A list of ``{"id", "doc", "meta"}`` dicts in the order returned by
        the collection; ``meta`` is always a dict.
    """
    col = get_collection(username, notebook_id)
    qemb = EMBED_MODEL.encode([query], normalize_embeddings=True).tolist()
    # Chroma always returns ids; "ids" is not a valid ``include`` entry, so
    # only the optional fields are requested.
    res = col.query(query_embeddings=qemb, n_results=k, include=["documents", "metadatas"])

    # One query embedding was sent, so take the first result list of each field.
    ids = (res.get("ids") or [[]])[0]
    docs = (res.get("documents") or [[]])[0]
    metas = (res.get("metadatas") or [[]])[0]
    return [
        {"id": chunk_id, "doc": doc, "meta": meta or {}}
        for chunk_id, doc, meta in zip(ids, docs, metas)
    ]
|
| 15 |
+
|
| 16 |
+
def format_sources(hits):
    """Render hits as a numbered source list, one ``[S<n>] title location`` per line.

    The location is ``slide N`` when the metadata has a truthy ``slide``,
    otherwise ``p.N`` for a truthy ``page``, otherwise empty. A missing title
    falls back to "source".
    """
    rendered = []
    for number, hit in enumerate(hits, start=1):
        meta = hit["meta"]
        if meta.get("slide"):
            where = f"slide {meta['slide']}"
        elif meta.get("page"):
            where = f"p.{meta['page']}"
        else:
            where = ""
        title = meta.get("source_title", "source")
        rendered.append(f"[S{number}] {title} {where}".strip())
    return "\n".join(rendered)
|
| 25 |
+
|
| 26 |
+
def context_block(hits):
    """Render hits as excerpt blocks separated by ``---`` rules.

    Each block is a ``[S<n>] title (location)`` header line followed by the
    chunk text. The location is ``(slide N)`` when the metadata has a truthy
    ``slide``, otherwise ``(page N)`` for a truthy ``page``, otherwise empty.
    """
    rendered = []
    for number, hit in enumerate(hits, start=1):
        meta = hit["meta"]
        if meta.get("slide"):
            where = f"(slide {meta['slide']})"
        elif meta.get("page"):
            where = f"(page {meta['page']})"
        else:
            where = ""
        title = meta.get("source_title", "source")
        header = f"[S{number}] {title} {where}"
        rendered.append(header + "\n" + f"{hit['doc']}")
    return "\n\n---\n\n".join(rendered)
|
| 35 |
+
|
| 36 |
+
def rag_answer(query: str, hits):
    """Answer ``query`` from the retrieved ``hits`` and append the source list.

    Returns:
        A fixed "not found" message when ``hits`` is empty; otherwise the LLM
        answer followed by a "Sources:" section.
    """
    if not hits:
        return "Not found in the provided sources. (No indexed chunks yet.)"

    source_list = format_sources(hits)
    excerpts = context_block(hits)
    prompt = f"""
You are a research assistant. Answer ONLY using the sources below.
Every non-trivial claim must end with citations like [S1] or [S2].
If not present in sources, say: Not found in the provided sources.

Question:
{query}

Sources list:
{source_list}

Source excerpts:
{excerpts}

Answer with citations:
"""
    answer = llm_generate(prompt, max_new_tokens=450, temperature=0.2)
    return f"{answer}\n\nSources:\n{source_list}"
|
src/frontend/__pycache__/callbacks.cpython-310.pyc
ADDED
|
Binary file (4.91 kB). View file
|
|
|
src/frontend/__pycache__/ui.cpython-310.pyc
ADDED
|
Binary file (3.19 kB). View file
|
|
|
src/frontend/callbacks.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
from src.backend.notebooks import create_notebook, rename_notebook, delete_notebook
|
| 6 |
+
from src.storage.index_store import list_notebooks
|
| 7 |
+
from src.storage.paths import ensure_tree
|
| 8 |
+
from src.storage.chat_store import append_chat, load_chat
|
| 9 |
+
from src.storage.artifact_store import list_artifacts as list_artifacts_store, next_artifact_path
|
| 10 |
+
from src.backend.ingest import ingest_files as ingest_files_backend, ingest_url as ingest_url_backend
|
| 11 |
+
from src.backend.rag import retrieve, rag_answer
|
| 12 |
+
from src.backend.artifacts import generate_report, generate_quiz, generate_podcast_transcript, transcript_to_mp3
|
| 13 |
+
|
| 14 |
+
def now_iso():
    """Return the current UTC time as an ISO-8601 string with a ``Z`` suffix.

    Uses an aware ``datetime.now(timezone.utc)`` rather than the deprecated
    ``datetime.utcnow()``; tzinfo is dropped before formatting so the output
    keeps the ``YYYY-MM-DDTHH:MM:SS[.ffffff]Z`` shape.
    """
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
|
| 16 |
+
|
| 17 |
+
def chat_pairs(history):
    """Convert stored role/content messages into ``(user, assistant)`` tuples.

    Every assistant message yields one pair, combined with the most recent
    user message not yet paired ("" if there is none). A user message that
    is followed by another user message is dropped; other roles are ignored.
    """
    pairs = []
    pending_user = None
    for message in history:
        role = message.get("role")
        if role == "user":
            pending_user = message.get("content", "")
        elif role == "assistant":
            reply = message.get("content", "")
            pairs.append((pending_user or "", reply))
            pending_user = None
    return pairs
|
| 27 |
+
|
| 28 |
+
def ui_bootstrap(username: str):
    """Build the initial UI state for ``username``.

    Creates "My First Notebook" when the user has none; otherwise selects the
    first listed notebook.

    Returns:
        A 3-tuple: notebook dropdown update, chat pairs for the selected
        notebook, and an artifacts dropdown update.
    """
    nbs = list_notebooks(username)
    if not nbs:
        current = create_notebook(username, "My First Notebook")
        nbs = list_notebooks(username)
    else:
        # Second element of each entry is used as the dropdown value
        # (presumably the notebook id — verify against list_notebooks).
        current = nbs[0][1]
    ensure_tree(username, current)
    history = load_chat(username, current)
    # A bare list returned to a Dropdown would set its *value*; wrap it so
    # the artifact names become the selectable choices instead.
    artifacts = gr.Dropdown(choices=list_artifacts_store(username, current), value=None)
    return gr.Dropdown(choices=nbs, value=current), chat_pairs(history), artifacts
|
| 39 |
+
|
| 40 |
+
def on_switch_notebook(username: str, notebook_id: str):
    """Load the chat history and artifact list of the newly selected notebook.

    Returns:
        A 2-tuple: chat pairs, and an artifacts dropdown update.
    """
    ensure_tree(username, notebook_id)
    history = load_chat(username, notebook_id)
    # A bare list returned to a Dropdown would set its *value*; wrap it so
    # the choices are replaced and any stale selection is cleared.
    artifacts = gr.Dropdown(choices=list_artifacts_store(username, notebook_id), value=None)
    return chat_pairs(history), artifacts
|
| 44 |
+
|
| 45 |
+
def on_create_notebook(username: str, name: str):
    """Create a notebook and switch the UI to it.

    Returns:
        A 3-tuple: notebook dropdown update selecting the new notebook, an
        empty chat, and an artifacts dropdown update.
    """
    nb_id = create_notebook(username, name)
    nbs = list_notebooks(username)
    # A bare list returned to a Dropdown would set its *value*; wrap it so
    # the choices are replaced and any stale selection is cleared.
    artifacts = gr.Dropdown(choices=list_artifacts_store(username, nb_id), value=None)
    return gr.Dropdown(choices=nbs, value=nb_id), [], artifacts
|
| 49 |
+
|
| 50 |
+
def on_rename_notebook(username: str, notebook_id: str, new_name: str):
    """Rename the current notebook and refresh the notebook dropdown.

    Returns:
        A dropdown update with the re-read notebook list, still selecting
        ``notebook_id``.
    """
    rename_notebook(username, notebook_id, new_name)
    refreshed_choices = list_notebooks(username)
    return gr.Dropdown(choices=refreshed_choices, value=notebook_id)
|
| 53 |
+
|
| 54 |
+
def on_delete_notebook(username: str, notebook_id: str):
    """Delete the current notebook, then rebuild the UI state from scratch.

    Returns:
        Whatever ``ui_bootstrap`` returns for the user after the deletion.
    """
    delete_notebook(username, notebook_id)
    refreshed_state = ui_bootstrap(username)
    return refreshed_state
|
| 57 |
+
|
| 58 |
+
def on_ingest_files(username: str, notebook_id: str, files):
    """Ingest the uploaded files into the notebook and report the chunk count.

    Raises:
        gr.Error: If no files were provided.
    """
    if files:
        chunk_count = ingest_files_backend(username, notebook_id, files)
        return f"Ingested files. Added {chunk_count} chunks."
    raise gr.Error("Upload at least one file.")
|
| 63 |
+
|
| 64 |
+
def on_ingest_url(username: str, notebook_id: str, url: str):
    """Ingest a web page into the notebook and report the chunk count.

    Raises:
        gr.Error: If the URL is missing or blank.
    """
    cleaned_url = (url or "").strip()
    if cleaned_url:
        chunk_count = ingest_url_backend(username, notebook_id, cleaned_url)
        return f"Ingested URL. Added {chunk_count} chunks."
    raise gr.Error("Enter a URL.")
|
| 70 |
+
|
| 71 |
+
def on_chat(username: str, notebook_id: str, chatbot, msg: str):
    """Answer a chat message with RAG and persist both sides of the exchange.

    Args:
        username: Owner of the notebook.
        notebook_id: Notebook to search and to log the chat in.
        chatbot: Current list of ``(user, assistant)`` pairs; may be ``None``.
        msg: The user's message; blank messages are ignored.

    Returns:
        A 2-tuple: the updated chat pairs and "" (to clear the input box).
    """
    msg = (msg or "").strip()
    if not msg:
        return chatbot, ""
    t0 = time.time()
    append_chat(username, notebook_id, {"role": "user", "content": msg, "ts": now_iso()})
    hits = retrieve(username, notebook_id, msg, k=6)
    ans = rag_answer(msg, hits)
    latency_ms = int((time.time() - t0) * 1000)
    append_chat(
        username,
        notebook_id,
        {"role": "assistant", "content": ans, "ts": now_iso(), "latency_ms": latency_ms},
    )
    # Tolerate a missing history instead of failing on ``None + list``.
    return list(chatbot or []) + [(msg, ans)], ""
|
| 82 |
+
|
| 83 |
+
def on_report(username: str, notebook_id: str, topic: str, extra: str):
    """Generate a study report from the notebook's sources and save it.

    Returns:
        A 3-tuple: status message, artifacts dropdown update (new report
        selected), and the path of the written markdown file.

    Raises:
        gr.Error: If the topic is blank or retrieval returns no chunks.
    """
    import os

    topic = (topic or "").strip()
    if not topic:
        raise gr.Error("Enter a topic.")
    hits = retrieve(username, notebook_id, topic, k=6)
    if not hits:
        raise gr.Error("No sources yet. Ingest first.")
    md = generate_report(topic, hits, extra)
    out = next_artifact_path(username, notebook_id, "reports", ".md")
    # Context manager so the handle is closed deterministically.
    with open(out, "w", encoding="utf-8") as f:
        f.write(md)
    # A bare list returned to a Dropdown would set its *value*; update the
    # choices explicitly and select the file just written.
    artifacts = gr.Dropdown(
        choices=list_artifacts_store(username, notebook_id),
        value=f"reports/{os.path.basename(out)}",
    )
    return "Report generated.", artifacts, out
|
| 94 |
+
|
| 95 |
+
def on_quiz(username: str, notebook_id: str, topic: str, extra: str):
    """Generate a quiz from the notebook's sources and save it.

    Returns:
        A 3-tuple: status message, artifacts dropdown update (new quiz
        selected), and the path of the written markdown file.

    Raises:
        gr.Error: If the topic is blank or retrieval returns no chunks.
    """
    import os

    topic = (topic or "").strip()
    if not topic:
        raise gr.Error("Enter a topic.")
    hits = retrieve(username, notebook_id, topic, k=6)
    if not hits:
        raise gr.Error("No sources yet. Ingest first.")
    md = generate_quiz(topic, hits, extra)
    out = next_artifact_path(username, notebook_id, "quizzes", ".md")
    # Context manager so the handle is closed deterministically.
    with open(out, "w", encoding="utf-8") as f:
        f.write(md)
    # A bare list returned to a Dropdown would set its *value*; update the
    # choices explicitly and select the file just written.
    artifacts = gr.Dropdown(
        choices=list_artifacts_store(username, notebook_id),
        value=f"quizzes/{os.path.basename(out)}",
    )
    return "Quiz generated.", artifacts, out
|
| 106 |
+
|
| 107 |
+
def on_podcast(username: str, notebook_id: str, topic: str, extra: str):
    """Generate a podcast transcript plus MP3 from the notebook's sources.

    Returns:
        A 4-tuple: status message, artifacts dropdown update (new transcript
        selected), transcript path, and MP3 path.

    Raises:
        gr.Error: If the topic is blank or retrieval returns no chunks.
    """
    import os

    topic = (topic or "").strip()
    if not topic:
        raise gr.Error("Enter a topic.")
    hits = retrieve(username, notebook_id, topic, k=6)
    if not hits:
        raise gr.Error("No sources yet. Ingest first.")
    md = generate_podcast_transcript(topic, hits, extra)
    md_path = next_artifact_path(username, notebook_id, "podcasts", ".md")
    # Context manager so the handle is closed deterministically.
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md)

    # Derive the audio name from the transcript name so the pair always
    # shares a number, even if an earlier MP3 synthesis failed and left the
    # per-extension counts out of step. An orphaned MP3 with the same number
    # is overwritten.
    mp3_path = os.path.splitext(md_path)[0] + ".mp3"
    transcript_to_mp3(md, mp3_path)

    # A bare list returned to a Dropdown would set its *value*; update the
    # choices explicitly and select the transcript just written.
    artifacts = gr.Dropdown(
        choices=list_artifacts_store(username, notebook_id),
        value=f"podcasts/{os.path.basename(md_path)}",
    )
    return "Podcast generated.", artifacts, md_path, mp3_path
|
| 122 |
+
|
| 123 |
+
def on_download(username: str, notebook_id: str, selection: str):
    """Resolve a selected artifact name to a file path for download.

    Args:
        username: Owner of the notebook.
        notebook_id: Notebook whose ``artifacts`` directory is searched.
        selection: Relative artifact name such as ``"reports/report_1.md"``.

    Returns:
        The resolved file path, or ``None`` when nothing is selected, the
        selection is not a string, it resolves outside the notebook's
        artifacts directory, or it is not an existing regular file.
    """
    import os
    from src.storage.paths import nb_root

    if not selection or not isinstance(selection, str):
        return None
    artifacts_root = os.path.realpath(os.path.join(nb_root(username, notebook_id), "artifacts"))
    target = os.path.realpath(os.path.join(artifacts_root, selection))
    # The selection is client-supplied: reject anything (``..``, absolute
    # paths, symlinks) that escapes the artifacts directory.
    try:
        inside = os.path.commonpath([artifacts_root, target]) == artifacts_root
    except ValueError:
        # Raised when the paths cannot be compared (e.g. different drives).
        inside = False
    if not inside:
        return None
    return target if os.path.isfile(target) else None
|
src/frontend/ui.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from src.frontend.callbacks import (
|
| 3 |
+
ui_bootstrap, on_switch_notebook, on_create_notebook, on_rename_notebook, on_delete_notebook,
|
| 4 |
+
on_ingest_files, on_ingest_url, on_chat, on_report, on_quiz, on_podcast, on_download
|
| 5 |
+
)
|
| 6 |
+
from src.backend.auth import require_login
|
| 7 |
+
|
| 8 |
+
def build_app():
    """Assemble the Gradio Blocks UI and wire each widget to its callback.

    Returns:
        The constructed ``gr.Blocks`` app; the caller launches it.
    """
    with gr.Blocks(title="NotebookLM Clone") as demo:
        gr.Markdown("# 📓 NotebookLM Clone (HF Auth + Chroma + RAG)")

        login = gr.LoginButton()
        username_state = gr.State("")

        def on_load(request: gr.Request):
            # Refuses the load (gr.Error) when the request has no username.
            username = require_login(request)
            dd, chat, arts = ui_bootstrap(username)
            return username, dd, chat, arts

        with gr.Row():
            # Left column: notebook management, ingestion and artifacts.
            with gr.Column(scale=1):
                user_box = gr.Textbox(label="User", interactive=False)
                notebook_dd = gr.Dropdown(label="Notebooks", choices=[], interactive=True)

                nb_new = gr.Textbox(label="Create notebook", placeholder="Name")
                btn_create = gr.Button("Create")

                nb_rename = gr.Textbox(label="Rename notebook", placeholder="New name")
                btn_rename = gr.Button("Rename")

                btn_delete = gr.Button("Delete current", variant="stop")

                gr.Markdown("## Ingest")
                file_up = gr.File(label="Upload PDF/PPTX/TXT", file_count="multiple")
                btn_ingest_files = gr.Button("Ingest Files")
                ingest_status = gr.Textbox(label="Status", interactive=False)

                url_in = gr.Textbox(label="URL", placeholder="https://...")
                btn_ingest_url = gr.Button("Ingest URL")
                url_status = gr.Textbox(label="Status", interactive=False)

                gr.Markdown("## Artifacts")
                topic = gr.Textbox(label="Topic / prompt")
                extra = gr.Textbox(label="Extra prompt (optional)")
                btn_report = gr.Button("Generate Report")
                btn_quiz = gr.Button("Generate Quiz")
                btn_podcast = gr.Button("Generate Podcast")

                artifact_status = gr.Textbox(label="Artifact status", interactive=False)
                artifacts_list = gr.Dropdown(label="Artifacts", choices=[], interactive=True)
                download_btn = gr.Button("Download selected")
                download_file = gr.File(label="Download", interactive=False)
                podcast_audio = gr.Audio(label="Podcast Audio", interactive=False)

            # Right column: the chat itself.
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(height=520, label="Chat (RAG + citations)")
                msg = gr.Textbox(label="Message")
                send = gr.Button("Send")

        # Input/output groups shared by several events below.
        notebook_ctx = [username_state, notebook_dd]
        artifact_inputs = [username_state, notebook_dd, topic, extra]
        notebook_view = [notebook_dd, chatbot, artifacts_list]

        # Page load: authenticate and populate the initial state.
        demo.load(
            on_load,
            inputs=None,
            outputs=[username_state, notebook_dd, chatbot, artifacts_list],
            queue=False,
        )
        # Mirror the stored username into the read-only "User" box.
        username_state.change(lambda u: u, inputs=username_state, outputs=user_box, queue=False)

        notebook_dd.change(
            on_switch_notebook,
            inputs=notebook_ctx,
            outputs=[chatbot, artifacts_list],
            queue=False,
        )

        # Notebook management.
        btn_create.click(
            on_create_notebook,
            inputs=[username_state, nb_new],
            outputs=notebook_view,
            queue=False,
        )
        btn_rename.click(
            on_rename_notebook,
            inputs=[username_state, notebook_dd, nb_rename],
            outputs=[notebook_dd],
            queue=False,
        )
        btn_delete.click(
            on_delete_notebook,
            inputs=notebook_ctx,
            outputs=notebook_view,
            queue=False,
        )

        # Ingestion.
        btn_ingest_files.click(
            on_ingest_files,
            inputs=[username_state, notebook_dd, file_up],
            outputs=[ingest_status],
            queue=True,
        )
        btn_ingest_url.click(
            on_ingest_url,
            inputs=[username_state, notebook_dd, url_in],
            outputs=[url_status],
            queue=True,
        )

        # Chat.
        send.click(
            on_chat,
            inputs=[username_state, notebook_dd, chatbot, msg],
            outputs=[chatbot, msg],
            queue=True,
        )

        # Artifact generation.
        btn_report.click(
            on_report,
            inputs=artifact_inputs,
            outputs=[artifact_status, artifacts_list, download_file],
            queue=True,
        )
        btn_quiz.click(
            on_quiz,
            inputs=artifact_inputs,
            outputs=[artifact_status, artifacts_list, download_file],
            queue=True,
        )
        btn_podcast.click(
            on_podcast,
            inputs=artifact_inputs,
            outputs=[artifact_status, artifacts_list, download_file, podcast_audio],
            queue=True,
        )

        download_btn.click(
            on_download,
            inputs=[username_state, notebook_dd, artifacts_list],
            outputs=[download_file],
            queue=False,
        )

    return demo
|
src/storage/__pycache__/artifact_store.cpython-310.pyc
ADDED
|
Binary file (1.18 kB). View file
|
|
|
src/storage/__pycache__/chat_store.cpython-310.pyc
ADDED
|
Binary file (1.17 kB). View file
|
|
|
src/storage/__pycache__/chroma_store.cpython-310.pyc
ADDED
|
Binary file (830 Bytes). View file
|
|
|
src/storage/__pycache__/index_store.cpython-310.pyc
ADDED
|
Binary file (1.73 kB). View file
|
|
|
src/storage/__pycache__/paths.cpython-310.pyc
ADDED
|
Binary file (1.26 kB). View file
|
|
|
src/storage/artifact_store.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from src.storage.paths import nb_root
|
| 3 |
+
|
| 4 |
+
def list_artifacts(username: str, notebook_id: str):
    """Return a notebook's stored artifacts as ``"<kind>/<filename>"`` strings.

    Kinds are visited in the fixed order reports, quizzes, podcasts; inside
    each kind the file names are sorted. Kind folders that do not exist are
    skipped, so a notebook without artifacts yields an empty list.
    """
    artifacts_dir = os.path.join(nb_root(username, notebook_id), "artifacts")
    found = []
    for kind in ("reports", "quizzes", "podcasts"):
        kind_dir = os.path.join(artifacts_dir, kind)
        if os.path.exists(kind_dir):
            found.extend(f"{kind}/{name}" for name in sorted(os.listdir(kind_dir)))
    return found
|
| 14 |
+
|
| 15 |
+
def next_artifact_path(username: str, notebook_id: str, kind: str, ext: str):
    """Return an unused path for a new artifact of the given kind.

    The file is named ``<prefix>_<n><ext>`` inside the notebook's
    ``artifacts/<kind>`` folder, which is created if missing.

    Args:
        username: Owner of the notebook.
        notebook_id: Notebook the artifact belongs to.
        kind: One of ``"reports"``, ``"quizzes"`` or ``"podcasts"``.
        ext: File extension including the dot (e.g. ``".md"``).

    Raises:
        KeyError: If ``kind`` is not one of the known kinds.
    """
    # Resolve the prefix first so an unknown kind fails before any directory
    # is created on disk.
    prefix = {"reports": "report", "quizzes": "quiz", "podcasts": "podcast"}[kind]
    base = os.path.join(nb_root(username, notebook_id), "artifacts", kind)
    os.makedirs(base, exist_ok=True)
    taken = {p for p in os.listdir(base) if p.endswith(ext)}
    # Start from count + 1 (the usual case), then skip forward past names that
    # already exist: after a deletion the plain count would point at a file
    # that is still present and the new artifact would overwrite it.
    n = len(taken) + 1
    while f"{prefix}_{n}{ext}" in taken:
        n += 1
    return os.path.join(base, f"{prefix}_{n}{ext}")
|
src/storage/chat_store.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, json
|
| 2 |
+
from src.storage.paths import nb_root
|
| 3 |
+
|
| 4 |
+
def chat_path(username: str, notebook_id: str) -> str:
    """Return the path of the notebook's chat log (``chat/messages.jsonl``)."""
    notebook_dir = nb_root(username, notebook_id)
    return os.path.join(notebook_dir, "chat", "messages.jsonl")
|
| 6 |
+
|
| 7 |
+
def append_chat(username: str, notebook_id: str, obj: dict):
    """Append ``obj`` to the notebook's chat log as one JSON line.

    The log's parent folder is created when missing. Non-ASCII characters
    are written as-is (UTF-8) rather than escaped.
    """
    log_file = chat_path(username, notebook_id)
    parent_dir = os.path.dirname(log_file)
    os.makedirs(parent_dir, exist_ok=True)
    with open(log_file, "a", encoding="utf-8") as fh:
        record = json.dumps(obj, ensure_ascii=False)
        fh.write(record + "\n")
|
| 12 |
+
|
| 13 |
+
def load_chat(username: str, notebook_id: str):
    """Load every message stored in the notebook's chat log.

    Returns:
        The decoded JSON objects in file order, or an empty list when the
        log file does not exist. Blank or malformed lines are skipped so a
        single corrupt record does not make the whole history unreadable.
    """
    p = chat_path(username, notebook_id)
    if not os.path.exists(p):
        return []
    out = []
    with open(p, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                out.append(json.loads(line))
            except ValueError:
                # json.JSONDecodeError subclasses ValueError. Only decode
                # failures are tolerated; a bare ``except`` would also hide
                # KeyboardInterrupt/SystemExit and genuine programming errors.
                continue
    return out
|
src/storage/chroma_store.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import chromadb
|
| 3 |
+
from chromadb.config import Settings
|
| 4 |
+
from src.storage.paths import nb_root
|
| 5 |
+
|
| 6 |
+
def chroma_client(username: str, notebook_id: str):
    """Return a persistent Chroma client stored in the notebook's ``chroma`` folder.

    The folder is created when missing; anonymized telemetry is switched off.
    """
    store_dir = os.path.join(nb_root(username, notebook_id), "chroma")
    os.makedirs(store_dir, exist_ok=True)
    client_settings = Settings(anonymized_telemetry=False)
    return chromadb.PersistentClient(path=store_dir, settings=client_settings)
|
| 10 |
+
|
| 11 |
+
def get_collection(username: str, notebook_id: str):
    """Return the notebook's ``"docs"`` Chroma collection, creating it on first use."""
    return chroma_client(username, notebook_id).get_or_create_collection(name="docs")
|
src/storage/index_store.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, json
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from .paths import user_root, index_path, ensure_tree
|
| 4 |
+
|
| 5 |
+
def now_iso():
    """Return the current UTC time as an ISO-8601 string with a trailing ``Z``."""
    # Local import: the module's top-level import only brings in ``datetime``.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp instead and drop tzinfo so isoformat() emits no "+00:00"
    # offset in front of the "Z" suffix (same output format as before).
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
|
| 7 |
+
|
| 8 |
+
def load_index(username: str) -> dict:
    """Read the user's notebook index, creating an empty one on first access.

    The user's notebooks folder is created if needed. When no index file
    exists yet, ``{"notebooks": []}`` is written before the file is read back.
    """
    os.makedirs(user_root(username), exist_ok=True)
    index_file = index_path(username)
    if not os.path.exists(index_file):
        empty_index = {"notebooks": []}
        with open(index_file, "w", encoding="utf-8") as fh:
            json.dump(empty_index, fh, indent=2)
    with open(index_file, "r", encoding="utf-8") as fh:
        data = json.load(fh)
    return data
|
| 16 |
+
|
| 17 |
+
def save_index(username: str, idx: dict):
    """Overwrite the user's notebook index file with ``idx`` as indented JSON."""
    target = index_path(username)
    with open(target, "w", encoding="utf-8") as fh:
        json.dump(idx, fh, indent=2)
|
| 20 |
+
|
| 21 |
+
def list_notebooks(username: str):
    """Return ``(name, id)`` pairs for the user's notebooks, in index order."""
    notebooks = load_index(username).get("notebooks", [])
    pairs = []
    for entry in notebooks:
        pairs.append((entry["name"], entry["id"]))
    return pairs
|
| 24 |
+
|
| 25 |
+
def touch_updated(username: str, notebook_id: str):
    """Stamp the notebook's ``updated_at`` with the current time and save the index.

    Only the first index entry whose ``id`` matches is stamped. The index is
    written back even when no entry matches.
    """
    index = load_index(username)
    candidates = (nb for nb in index.get("notebooks", []) if nb["id"] == notebook_id)
    target = next(candidates, None)
    if target is not None:
        target["updated_at"] = now_iso()
    save_index(username, index)
|
src/storage/paths.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
# DATA_ROOT selects the folder under which all user data is stored.
# - If the DATA_ROOT env var is set (e.g. DATA_ROOT=/data in the HF Space variables), it is used as-is.
# - Otherwise (typical local run), data is written to ./data in the project folder.
|
| 7 |
+
DEFAULT_LOCAL_DATA = str(Path(__file__).resolve().parents[2] / "data")
|
| 8 |
+
|
| 9 |
+
DATA_ROOT = os.environ.get("DATA_ROOT", DEFAULT_LOCAL_DATA)
|
| 10 |
+
|
| 11 |
+
def user_root(username: str) -> str:
    """Return the folder holding all notebooks of ``username`` (``<DATA_ROOT>/users/<username>/notebooks``)."""
    parts = (DATA_ROOT, "users", username, "notebooks")
    return os.path.join(*parts)
|
| 13 |
+
|
| 14 |
+
def index_path(username: str) -> str:
    """Return the path of the user's notebook index file (``index.json``)."""
    notebooks_dir = user_root(username)
    return os.path.join(notebooks_dir, "index.json")
|
| 16 |
+
|
| 17 |
+
def nb_root(username: str, notebook_id: str) -> str:
    """Return the root folder of one notebook inside the user's notebooks folder."""
    parent = user_root(username)
    return os.path.join(parent, notebook_id)
|
| 19 |
+
|
| 20 |
+
def ensure_tree(username: str, notebook_id: str):
    """Create the full on-disk folder layout for a notebook.

    Makes the user's notebooks folder plus the notebook's raw/extracted file
    folders, its Chroma store, chat folder and the three artifact folders.
    Folders that already exist are left untouched.
    """
    notebook_dir = nb_root(username, notebook_id)
    subfolders = (
        ("files_raw",),
        ("files_extracted",),
        ("chroma",),
        ("chat",),
        ("artifacts", "reports"),
        ("artifacts", "quizzes"),
        ("artifacts", "podcasts"),
    )
    targets = [user_root(username)]
    targets.extend(os.path.join(notebook_dir, *parts) for parts in subfolders)
    for folder in targets:
        os.makedirs(folder, exist_ok=True)
|
src/utils/__pycache__/text.cpython-310.pyc
ADDED
|
Binary file (423 Bytes). View file
|
|
|
src/utils/text.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
def safe_name(s: str) -> str:
    """Return a sanitized display/file-safe name derived from ``s``.

    Only ASCII letters, digits, ``_``, ``-`` and single spaces are kept; the
    result is at most 60 characters long with no leading or trailing
    whitespace. ``None``, empty input, or input with no allowed characters
    yields ``"Untitled"``.
    """
    # Turn every whitespace run (tabs, newlines, ...) into one space BEFORE
    # filtering; otherwise the filter deletes those characters and glues the
    # neighbouring words together ("a\tb" -> "ab").
    text = re.sub(r"\s+", " ", s or "")
    text = re.sub(r"[^a-zA-Z0-9_\- ]+", "", text)
    # Removing characters can leave adjacent spaces ("a @ b" -> "a  b").
    text = re.sub(r" +", " ", text).strip()
    # Truncation may cut right after a space; strip again so the name never
    # ends in whitespace.
    return text[:60].rstrip() or "Untitled"
|
src/utils/timing.py
ADDED
|
File without changes
|