Spaces:
Sleeping
Sleeping
internomega-terrablue commited on
Commit Β·
9f911b3
1
Parent(s): 3780b4d
ingestion changes
Browse files- app.py +11 -0
- ingestion_engine/__init__.py +5 -0
- ingestion_engine/chunker.py +79 -0
- ingestion_engine/embedding_generator.py +37 -0
- ingestion_engine/ingestion_manager.py +101 -0
- ingestion_engine/pdf_extractor.py +15 -0
- ingestion_engine/text_extractor.py +7 -0
- ingestion_engine/transcripter.py +25 -0
- ingestion_engine/url_scrapper.py +21 -0
- pages/sources.py +53 -5
- persistence/__init__.py +5 -0
- persistence/vector_store.py +107 -0
- requirements.txt +7 -0
- state.py +8 -1
- theme.py +9 -0
app.py
CHANGED
|
@@ -25,6 +25,7 @@ from pages.sources import (
|
|
| 25 |
handle_file_upload,
|
| 26 |
handle_url_add,
|
| 27 |
handle_source_delete,
|
|
|
|
| 28 |
)
|
| 29 |
from pages.artifacts import (
|
| 30 |
render_no_sources_gate,
|
|
@@ -431,6 +432,11 @@ with gr.Blocks(css=CUSTOM_CSS, theme=dark_theme, title="NotebookLM") as demo:
|
|
| 431 |
inputs=[file_uploader, user_state],
|
| 432 |
outputs=[user_state, source_list_html, source_header, source_selector],
|
| 433 |
api_name=False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
).then(
|
| 435 |
fn=refresh_all,
|
| 436 |
inputs=[user_state],
|
|
@@ -444,6 +450,11 @@ with gr.Blocks(css=CUSTOM_CSS, theme=dark_theme, title="NotebookLM") as demo:
|
|
| 444 |
inputs=[url_input, user_state],
|
| 445 |
outputs=[user_state, source_list_html, source_header, url_input, source_selector],
|
| 446 |
api_name=False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
).then(
|
| 448 |
fn=refresh_all,
|
| 449 |
inputs=[user_state],
|
|
|
|
| 25 |
handle_file_upload,
|
| 26 |
handle_url_add,
|
| 27 |
handle_source_delete,
|
| 28 |
+
run_ingestion_pipeline,
|
| 29 |
)
|
| 30 |
from pages.artifacts import (
|
| 31 |
render_no_sources_gate,
|
|
|
|
| 432 |
inputs=[file_uploader, user_state],
|
| 433 |
outputs=[user_state, source_list_html, source_header, source_selector],
|
| 434 |
api_name=False,
|
| 435 |
+
).then(
|
| 436 |
+
fn=run_ingestion_pipeline,
|
| 437 |
+
inputs=[user_state],
|
| 438 |
+
outputs=[user_state, source_list_html, source_header, source_selector],
|
| 439 |
+
api_name=False,
|
| 440 |
).then(
|
| 441 |
fn=refresh_all,
|
| 442 |
inputs=[user_state],
|
|
|
|
| 450 |
inputs=[url_input, user_state],
|
| 451 |
outputs=[user_state, source_list_html, source_header, url_input, source_selector],
|
| 452 |
api_name=False,
|
| 453 |
+
).then(
|
| 454 |
+
fn=run_ingestion_pipeline,
|
| 455 |
+
inputs=[user_state],
|
| 456 |
+
outputs=[user_state, source_list_html, source_header, source_selector],
|
| 457 |
+
api_name=False,
|
| 458 |
).then(
|
| 459 |
fn=refresh_all,
|
| 460 |
inputs=[user_state],
|
ingestion_engine/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Ingestion Engine β document processing pipeline."""
|
| 2 |
+
|
| 3 |
+
from ingestion_engine.ingestion_manager import IngestionManager
|
| 4 |
+
|
| 5 |
+
__all__ = ["IngestionManager"]
|
ingestion_engine/chunker.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Recursive character text splitter with overlap."""
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def chunk_text(
|
| 5 |
+
text: str,
|
| 6 |
+
chunk_size: int = 500,
|
| 7 |
+
chunk_overlap: int = 50,
|
| 8 |
+
) -> list[dict]:
|
| 9 |
+
"""
|
| 10 |
+
Split text into overlapping chunks.
|
| 11 |
+
|
| 12 |
+
Uses character-count heuristic: 1 token ~ 4 characters.
|
| 13 |
+
Splits recursively on paragraph, newline, sentence, then word boundaries.
|
| 14 |
+
|
| 15 |
+
Returns list of {"text": str, "chunk_index": int}.
|
| 16 |
+
"""
|
| 17 |
+
separators = ["\n\n", "\n", ". ", " "]
|
| 18 |
+
char_size = chunk_size * 4
|
| 19 |
+
char_overlap = chunk_overlap * 4
|
| 20 |
+
|
| 21 |
+
raw_chunks = _recursive_split(text, char_size, separators)
|
| 22 |
+
merged = _merge_small_chunks(raw_chunks, char_size)
|
| 23 |
+
overlapped = _apply_overlap(merged, char_overlap)
|
| 24 |
+
|
| 25 |
+
return [
|
| 26 |
+
{"text": chunk.strip(), "chunk_index": i}
|
| 27 |
+
for i, chunk in enumerate(overlapped)
|
| 28 |
+
if chunk.strip()
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _recursive_split(text: str, max_chars: int, separators: list[str]) -> list[str]:
|
| 33 |
+
"""Split text using the first separator that produces sub-max chunks."""
|
| 34 |
+
if len(text) <= max_chars:
|
| 35 |
+
return [text]
|
| 36 |
+
|
| 37 |
+
for sep in separators:
|
| 38 |
+
parts = text.split(sep)
|
| 39 |
+
if len(parts) > 1:
|
| 40 |
+
chunks = []
|
| 41 |
+
for part in parts:
|
| 42 |
+
if len(part) <= max_chars:
|
| 43 |
+
chunks.append(part)
|
| 44 |
+
else:
|
| 45 |
+
remaining_seps = separators[separators.index(sep) + 1 :]
|
| 46 |
+
if remaining_seps:
|
| 47 |
+
chunks.extend(_recursive_split(part, max_chars, remaining_seps))
|
| 48 |
+
else:
|
| 49 |
+
for i in range(0, len(part), max_chars):
|
| 50 |
+
chunks.append(part[i : i + max_chars])
|
| 51 |
+
return chunks
|
| 52 |
+
|
| 53 |
+
return [text[i : i + max_chars] for i in range(0, len(text), max_chars)]
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _merge_small_chunks(chunks: list[str], max_chars: int) -> list[str]:
|
| 57 |
+
"""Merge consecutive small chunks that together fit within max_chars."""
|
| 58 |
+
merged = []
|
| 59 |
+
buffer = ""
|
| 60 |
+
for chunk in chunks:
|
| 61 |
+
if buffer and len(buffer) + len(chunk) + 1 > max_chars:
|
| 62 |
+
merged.append(buffer)
|
| 63 |
+
buffer = chunk
|
| 64 |
+
else:
|
| 65 |
+
buffer = (buffer + " " + chunk) if buffer else chunk
|
| 66 |
+
if buffer:
|
| 67 |
+
merged.append(buffer)
|
| 68 |
+
return merged
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def _apply_overlap(chunks: list[str], overlap_chars: int) -> list[str]:
|
| 72 |
+
"""Prepend the tail of the previous chunk to each subsequent chunk."""
|
| 73 |
+
if len(chunks) <= 1 or overlap_chars <= 0:
|
| 74 |
+
return chunks
|
| 75 |
+
result = [chunks[0]]
|
| 76 |
+
for i in range(1, len(chunks)):
|
| 77 |
+
prev_tail = chunks[i - 1][-overlap_chars:]
|
| 78 |
+
result.append(prev_tail + " " + chunks[i])
|
| 79 |
+
return result
|
ingestion_engine/embedding_generator.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Sentence-transformer embedding generation with model caching."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from functools import lru_cache
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
MODEL_NAME = "all-MiniLM-L6-v2"
|
| 9 |
+
EMBEDDING_DIM = 384
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@lru_cache(maxsize=1)
|
| 13 |
+
def get_model():
|
| 14 |
+
"""Load and cache the sentence-transformers model (loaded once per process)."""
|
| 15 |
+
from sentence_transformers import SentenceTransformer
|
| 16 |
+
|
| 17 |
+
logger.info("Loading embedding model: %s", MODEL_NAME)
|
| 18 |
+
model = SentenceTransformer(MODEL_NAME)
|
| 19 |
+
logger.info("Embedding model loaded. Dimension: %d", EMBEDDING_DIM)
|
| 20 |
+
return model
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def generate(texts: list[str], batch_size: int = 64) -> list[list[float]]:
|
| 24 |
+
"""Encode a list of texts into normalized embedding vectors."""
|
| 25 |
+
model = get_model()
|
| 26 |
+
embeddings = model.encode(
|
| 27 |
+
texts,
|
| 28 |
+
batch_size=batch_size,
|
| 29 |
+
show_progress_bar=False,
|
| 30 |
+
normalize_embeddings=True,
|
| 31 |
+
)
|
| 32 |
+
return embeddings.tolist()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def generate_query(query: str) -> list[float]:
|
| 36 |
+
"""Embed a single query string (convenience wrapper for future RAG Engine)."""
|
| 37 |
+
return generate([query])[0]
|
ingestion_engine/ingestion_manager.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Ingestion Manager β orchestrates the full source processing pipeline."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
|
| 5 |
+
from ingestion_engine import pdf_extractor, text_extractor, url_scrapper, transcripter
|
| 6 |
+
from ingestion_engine.chunker import chunk_text
|
| 7 |
+
from ingestion_engine.embedding_generator import generate
|
| 8 |
+
from persistence.vector_store import VectorStore
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
EXTRACTORS = {
|
| 13 |
+
"pdf": lambda fp, _url: pdf_extractor.extract(fp),
|
| 14 |
+
"txt": lambda fp, _url: text_extractor.extract(fp),
|
| 15 |
+
"pptx": lambda fp, _url: _extract_pptx(fp),
|
| 16 |
+
"url": lambda _fp, url: url_scrapper.extract(url),
|
| 17 |
+
"youtube": lambda _fp, url: transcripter.extract(url),
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _extract_pptx(file_path: str) -> str:
|
| 22 |
+
"""Extract text from a PPTX file (lazy import to keep python-pptx optional)."""
|
| 23 |
+
from pptx import Presentation
|
| 24 |
+
|
| 25 |
+
prs = Presentation(file_path)
|
| 26 |
+
texts = []
|
| 27 |
+
for slide in prs.slides:
|
| 28 |
+
for shape in slide.shapes:
|
| 29 |
+
if shape.has_text_frame:
|
| 30 |
+
for paragraph in shape.text_frame.paragraphs:
|
| 31 |
+
text = paragraph.text.strip()
|
| 32 |
+
if text:
|
| 33 |
+
texts.append(text)
|
| 34 |
+
return "\n\n".join(texts)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class IngestionManager:
|
| 38 |
+
"""Orchestrates: extract -> chunk -> embed -> store in Pinecone."""
|
| 39 |
+
|
| 40 |
+
def __init__(self):
|
| 41 |
+
self.vector_store = VectorStore()
|
| 42 |
+
|
| 43 |
+
def process_source(self, source, file_path: str | None, notebook_id: str) -> tuple[int, str | None]:
|
| 44 |
+
"""
|
| 45 |
+
Run the full ingestion pipeline for a single source.
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
source: state.Source object
|
| 49 |
+
file_path: local file path (None for URL/YouTube sources)
|
| 50 |
+
notebook_id: used as Pinecone namespace
|
| 51 |
+
|
| 52 |
+
Returns:
|
| 53 |
+
(chunk_count, error_message) β error_message is None on success
|
| 54 |
+
"""
|
| 55 |
+
try:
|
| 56 |
+
# Step 1: Extract text
|
| 57 |
+
extractor = EXTRACTORS.get(source.file_type)
|
| 58 |
+
if not extractor:
|
| 59 |
+
return 0, f"Unsupported file type: {source.file_type}"
|
| 60 |
+
|
| 61 |
+
raw_text = extractor(file_path, source.source_url)
|
| 62 |
+
if not raw_text or not raw_text.strip():
|
| 63 |
+
return 0, "No text could be extracted from this source."
|
| 64 |
+
|
| 65 |
+
logger.info("Extracted %d characters from %s", len(raw_text), source.filename)
|
| 66 |
+
|
| 67 |
+
# Step 2: Chunk
|
| 68 |
+
chunks = chunk_text(raw_text, chunk_size=500, chunk_overlap=50)
|
| 69 |
+
if not chunks:
|
| 70 |
+
return 0, "Text was extracted but produced no usable chunks."
|
| 71 |
+
|
| 72 |
+
logger.info("Created %d chunks from %s", len(chunks), source.filename)
|
| 73 |
+
|
| 74 |
+
# Step 3: Embed
|
| 75 |
+
chunk_texts = [c["text"] for c in chunks]
|
| 76 |
+
vectors = generate(chunk_texts)
|
| 77 |
+
|
| 78 |
+
logger.info("Generated %d embeddings for %s", len(vectors), source.filename)
|
| 79 |
+
|
| 80 |
+
# Step 4: Prepare records and upsert to Pinecone
|
| 81 |
+
records = []
|
| 82 |
+
for chunk, vector in zip(chunks, vectors):
|
| 83 |
+
records.append({
|
| 84 |
+
"id": f"{source.id}_{chunk['chunk_index']}",
|
| 85 |
+
"values": vector,
|
| 86 |
+
"metadata": {
|
| 87 |
+
"source_id": source.id,
|
| 88 |
+
"source_filename": source.filename,
|
| 89 |
+
"chunk_index": chunk["chunk_index"],
|
| 90 |
+
"text": chunk["text"][:1000],
|
| 91 |
+
},
|
| 92 |
+
})
|
| 93 |
+
|
| 94 |
+
self.vector_store.upsert(records, namespace=notebook_id)
|
| 95 |
+
|
| 96 |
+
logger.info("Stored %d vectors for %s in namespace %s", len(records), source.filename, notebook_id)
|
| 97 |
+
return len(chunks), None
|
| 98 |
+
|
| 99 |
+
except Exception as e:
|
| 100 |
+
logger.error("Ingestion failed for %s: %s", source.filename, e)
|
| 101 |
+
return 0, f"Ingestion error: {str(e)}"
|
ingestion_engine/pdf_extractor.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PDF text extraction using PyMuPDF."""
|
| 2 |
+
|
| 3 |
+
import fitz
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def extract(file_path: str) -> str:
|
| 7 |
+
"""Extract text from all pages of a PDF file."""
|
| 8 |
+
doc = fitz.open(file_path)
|
| 9 |
+
pages = []
|
| 10 |
+
for page in doc:
|
| 11 |
+
text = page.get_text()
|
| 12 |
+
if text.strip():
|
| 13 |
+
pages.append(text)
|
| 14 |
+
doc.close()
|
| 15 |
+
return "\n\n".join(pages)
|
ingestion_engine/text_extractor.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Plain text file extraction."""
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def extract(file_path: str) -> str:
|
| 5 |
+
"""Read a plain text file as UTF-8."""
|
| 6 |
+
with open(file_path, "r", encoding="utf-8", errors="replace") as f:
|
| 7 |
+
return f.read()
|
ingestion_engine/transcripter.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""YouTube video transcript extraction."""
|
| 2 |
+
|
| 3 |
+
from urllib.parse import urlparse, parse_qs
|
| 4 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def extract(url: str) -> str:
|
| 8 |
+
"""Fetch the transcript for a YouTube video."""
|
| 9 |
+
video_id = _parse_video_id(url)
|
| 10 |
+
if not video_id:
|
| 11 |
+
raise ValueError(f"Could not parse YouTube video ID from: {url}")
|
| 12 |
+
|
| 13 |
+
transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
|
| 14 |
+
return " ".join(entry["text"] for entry in transcript_list)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _parse_video_id(url: str) -> str | None:
|
| 18 |
+
"""Extract video ID from youtube.com/watch?v=... or youtu.be/... URLs."""
|
| 19 |
+
parsed = urlparse(url)
|
| 20 |
+
hostname = parsed.hostname or ""
|
| 21 |
+
if "youtu.be" in hostname:
|
| 22 |
+
return parsed.path.lstrip("/")
|
| 23 |
+
if "youtube.com" in hostname:
|
| 24 |
+
return parse_qs(parsed.query).get("v", [None])[0]
|
| 25 |
+
return None
|
ingestion_engine/url_scrapper.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Web page text extraction via requests + BeautifulSoup."""
|
| 2 |
+
|
| 3 |
+
import requests
|
| 4 |
+
from bs4 import BeautifulSoup
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def extract(url: str) -> str:
|
| 8 |
+
"""Fetch a web page and extract its main text content."""
|
| 9 |
+
headers = {"User-Agent": "Mozilla/5.0 (NotebookLM Bot)"}
|
| 10 |
+
response = requests.get(url, headers=headers, timeout=15)
|
| 11 |
+
response.raise_for_status()
|
| 12 |
+
|
| 13 |
+
soup = BeautifulSoup(response.text, "html.parser")
|
| 14 |
+
|
| 15 |
+
# Remove non-content elements
|
| 16 |
+
for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
|
| 17 |
+
tag.decompose()
|
| 18 |
+
|
| 19 |
+
text = soup.get_text(separator="\n", strip=True)
|
| 20 |
+
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
| 21 |
+
return "\n".join(lines)
|
pages/sources.py
CHANGED
|
@@ -59,6 +59,14 @@ def render_source_list(state: UserData) -> str:
|
|
| 59 |
meta_parts.append(f"{source.chunk_count} chunks")
|
| 60 |
meta_str = " Β· ".join(meta_parts)
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
html += (
|
| 63 |
f'<div class="source-card">'
|
| 64 |
f'<div class="source-icon {ft}">{cfg["icon"]}</div>'
|
|
@@ -66,7 +74,7 @@ def render_source_list(state: UserData) -> str:
|
|
| 66 |
f'<div class="name">{source.filename}</div>'
|
| 67 |
f'<div class="meta">{meta_str}</div>'
|
| 68 |
f'</div>'
|
| 69 |
-
f'
|
| 70 |
f'</div>'
|
| 71 |
)
|
| 72 |
return html
|
|
@@ -100,13 +108,14 @@ def handle_file_upload(files, state: UserData) -> tuple[UserData, str, str, list
|
|
| 100 |
if file_ext not in ALLOWED_TYPES:
|
| 101 |
continue
|
| 102 |
|
| 103 |
-
# Get file size
|
| 104 |
try:
|
| 105 |
import os
|
| 106 |
file_path = f.name if hasattr(f, 'name') else str(f)
|
| 107 |
size_bytes = os.path.getsize(file_path)
|
| 108 |
size_mb = round(size_bytes / (1024 * 1024), 2)
|
| 109 |
except Exception:
|
|
|
|
| 110 |
size_mb = 0
|
| 111 |
|
| 112 |
if size_mb > MAX_FILE_SIZE_MB:
|
|
@@ -119,9 +128,10 @@ def handle_file_upload(files, state: UserData) -> tuple[UserData, str, str, list
|
|
| 119 |
size_mb=size_mb,
|
| 120 |
source_url=None,
|
| 121 |
chunk_count=0,
|
| 122 |
-
status="
|
| 123 |
error_message=None,
|
| 124 |
created_at=datetime.now().isoformat(),
|
|
|
|
| 125 |
)
|
| 126 |
nb.sources.append(source)
|
| 127 |
|
|
@@ -153,9 +163,10 @@ def handle_url_add(url: str, state: UserData) -> tuple[UserData, str, str, str,
|
|
| 153 |
size_mb=None,
|
| 154 |
source_url=url,
|
| 155 |
chunk_count=0,
|
| 156 |
-
status="
|
| 157 |
error_message=None,
|
| 158 |
created_at=datetime.now().isoformat(),
|
|
|
|
| 159 |
)
|
| 160 |
nb.sources.append(source)
|
| 161 |
|
|
@@ -163,10 +174,47 @@ def handle_url_add(url: str, state: UserData) -> tuple[UserData, str, str, str,
|
|
| 163 |
|
| 164 |
|
| 165 |
def handle_source_delete(source_name: str, state: UserData) -> tuple[UserData, str, str, list[str]]:
|
| 166 |
-
"""Delete a source by filename
|
| 167 |
nb = get_active_notebook(state)
|
| 168 |
if not nb or not source_name:
|
| 169 |
return state, render_source_list(state), render_source_header(state), get_source_choices(state)
|
| 170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
nb.sources = [s for s in nb.sources if s.filename != source_name]
|
| 172 |
return state, render_source_list(state), render_source_header(state), get_source_choices(state)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
meta_parts.append(f"{source.chunk_count} chunks")
|
| 60 |
meta_str = " Β· ".join(meta_parts)
|
| 61 |
|
| 62 |
+
if source.status == "processing":
|
| 63 |
+
badge = '<span class="source-badge processing">Processing...</span>'
|
| 64 |
+
elif source.status == "failed":
|
| 65 |
+
err = source.error_message or "Unknown error"
|
| 66 |
+
badge = f'<span class="source-badge failed" title="{err}">Failed</span>'
|
| 67 |
+
else:
|
| 68 |
+
badge = '<span class="source-badge ready">Ready</span>'
|
| 69 |
+
|
| 70 |
html += (
|
| 71 |
f'<div class="source-card">'
|
| 72 |
f'<div class="source-icon {ft}">{cfg["icon"]}</div>'
|
|
|
|
| 74 |
f'<div class="name">{source.filename}</div>'
|
| 75 |
f'<div class="meta">{meta_str}</div>'
|
| 76 |
f'</div>'
|
| 77 |
+
f'{badge}'
|
| 78 |
f'</div>'
|
| 79 |
)
|
| 80 |
return html
|
|
|
|
| 108 |
if file_ext not in ALLOWED_TYPES:
|
| 109 |
continue
|
| 110 |
|
| 111 |
+
# Get file size and path
|
| 112 |
try:
|
| 113 |
import os
|
| 114 |
file_path = f.name if hasattr(f, 'name') else str(f)
|
| 115 |
size_bytes = os.path.getsize(file_path)
|
| 116 |
size_mb = round(size_bytes / (1024 * 1024), 2)
|
| 117 |
except Exception:
|
| 118 |
+
file_path = None
|
| 119 |
size_mb = 0
|
| 120 |
|
| 121 |
if size_mb > MAX_FILE_SIZE_MB:
|
|
|
|
| 128 |
size_mb=size_mb,
|
| 129 |
source_url=None,
|
| 130 |
chunk_count=0,
|
| 131 |
+
status="processing",
|
| 132 |
error_message=None,
|
| 133 |
created_at=datetime.now().isoformat(),
|
| 134 |
+
file_path=file_path,
|
| 135 |
)
|
| 136 |
nb.sources.append(source)
|
| 137 |
|
|
|
|
| 163 |
size_mb=None,
|
| 164 |
source_url=url,
|
| 165 |
chunk_count=0,
|
| 166 |
+
status="processing",
|
| 167 |
error_message=None,
|
| 168 |
created_at=datetime.now().isoformat(),
|
| 169 |
+
file_path=None,
|
| 170 |
)
|
| 171 |
nb.sources.append(source)
|
| 172 |
|
|
|
|
| 174 |
|
| 175 |
|
| 176 |
def handle_source_delete(source_name: str, state: UserData) -> tuple[UserData, str, str, list[str]]:
|
| 177 |
+
"""Delete a source by filename and remove its vectors from Pinecone."""
|
| 178 |
nb = get_active_notebook(state)
|
| 179 |
if not nb or not source_name:
|
| 180 |
return state, render_source_list(state), render_source_header(state), get_source_choices(state)
|
| 181 |
|
| 182 |
+
# Delete vectors from Pinecone before removing from state
|
| 183 |
+
source_to_delete = next((s for s in nb.sources if s.filename == source_name), None)
|
| 184 |
+
if source_to_delete:
|
| 185 |
+
try:
|
| 186 |
+
from persistence.vector_store import VectorStore
|
| 187 |
+
VectorStore().delete_by_source(source_to_delete.id, nb.id)
|
| 188 |
+
except Exception:
|
| 189 |
+
pass # Best-effort; source removed from UI regardless
|
| 190 |
+
|
| 191 |
nb.sources = [s for s in nb.sources if s.filename != source_name]
|
| 192 |
return state, render_source_list(state), render_source_header(state), get_source_choices(state)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def run_ingestion_pipeline(state: UserData) -> tuple[UserData, str, str, list[str]]:
|
| 196 |
+
"""Process all sources with status='processing' through the ingestion engine."""
|
| 197 |
+
from ingestion_engine import IngestionManager
|
| 198 |
+
|
| 199 |
+
nb = get_active_notebook(state)
|
| 200 |
+
if not nb:
|
| 201 |
+
return state, render_source_list(state), render_source_header(state), get_source_choices(state)
|
| 202 |
+
|
| 203 |
+
manager = IngestionManager()
|
| 204 |
+
|
| 205 |
+
for source in nb.sources:
|
| 206 |
+
if source.status != "processing":
|
| 207 |
+
continue
|
| 208 |
+
|
| 209 |
+
chunk_count, error = manager.process_source(source, source.file_path, nb.id)
|
| 210 |
+
|
| 211 |
+
if error:
|
| 212 |
+
source.status = "failed"
|
| 213 |
+
source.error_message = error
|
| 214 |
+
source.chunk_count = 0
|
| 215 |
+
else:
|
| 216 |
+
source.status = "ready"
|
| 217 |
+
source.error_message = None
|
| 218 |
+
source.chunk_count = chunk_count
|
| 219 |
+
|
| 220 |
+
return state, render_source_list(state), render_source_header(state), get_source_choices(state)
|
persistence/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Persistence Layer β storage services."""
|
| 2 |
+
|
| 3 |
+
from persistence.vector_store import VectorStore
|
| 4 |
+
|
| 5 |
+
__all__ = ["VectorStore"]
|
persistence/vector_store.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pinecone vector store operations."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
INDEX_NAME = "notebooklm"
|
| 9 |
+
UPSERT_BATCH_SIZE = 100
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class VectorStore:
|
| 13 |
+
"""Pinecone client for upserting, deleting, and querying vectors."""
|
| 14 |
+
|
| 15 |
+
def __init__(self):
|
| 16 |
+
self._index = None
|
| 17 |
+
|
| 18 |
+
def _get_index(self):
|
| 19 |
+
"""Lazy-initialize the Pinecone index connection."""
|
| 20 |
+
if self._index is not None:
|
| 21 |
+
return self._index
|
| 22 |
+
|
| 23 |
+
from pinecone import Pinecone
|
| 24 |
+
|
| 25 |
+
api_key = os.environ.get("PINECONE_API_KEY")
|
| 26 |
+
if not api_key:
|
| 27 |
+
raise RuntimeError(
|
| 28 |
+
"PINECONE_API_KEY not found in environment. "
|
| 29 |
+
"Add it as a Secret in your HF Space settings."
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
pc = Pinecone(api_key=api_key)
|
| 33 |
+
self._index = pc.Index(INDEX_NAME)
|
| 34 |
+
logger.info("Connected to Pinecone index: %s", INDEX_NAME)
|
| 35 |
+
return self._index
|
| 36 |
+
|
| 37 |
+
def upsert(self, records: list[dict], namespace: str) -> int:
|
| 38 |
+
"""
|
| 39 |
+
Upsert embedding records into Pinecone in batches.
|
| 40 |
+
|
| 41 |
+
Args:
|
| 42 |
+
records: List of {"id": str, "values": list[float], "metadata": dict}
|
| 43 |
+
namespace: Pinecone namespace (notebook_id)
|
| 44 |
+
|
| 45 |
+
Returns:
|
| 46 |
+
Number of vectors upserted
|
| 47 |
+
"""
|
| 48 |
+
index = self._get_index()
|
| 49 |
+
total = 0
|
| 50 |
+
for i in range(0, len(records), UPSERT_BATCH_SIZE):
|
| 51 |
+
batch = records[i : i + UPSERT_BATCH_SIZE]
|
| 52 |
+
index.upsert(vectors=batch, namespace=namespace)
|
| 53 |
+
total += len(batch)
|
| 54 |
+
logger.info("Upserted %d vectors to namespace '%s'", total, namespace)
|
| 55 |
+
return total
|
| 56 |
+
|
| 57 |
+
def delete_by_source(self, source_id: str, namespace: str) -> None:
|
| 58 |
+
"""Delete all vectors belonging to a specific source."""
|
| 59 |
+
try:
|
| 60 |
+
index = self._get_index()
|
| 61 |
+
index.delete(
|
| 62 |
+
filter={"source_id": {"$eq": source_id}},
|
| 63 |
+
namespace=namespace,
|
| 64 |
+
)
|
| 65 |
+
logger.info("Deleted vectors for source '%s' from namespace '%s'", source_id, namespace)
|
| 66 |
+
except Exception as e:
|
| 67 |
+
logger.error("Failed to delete vectors from Pinecone: %s", e)
|
| 68 |
+
|
| 69 |
+
def delete_namespace(self, namespace: str) -> None:
|
| 70 |
+
"""Delete all vectors in a namespace (when a notebook is deleted)."""
|
| 71 |
+
try:
|
| 72 |
+
index = self._get_index()
|
| 73 |
+
index.delete(delete_all=True, namespace=namespace)
|
| 74 |
+
logger.info("Deleted entire namespace '%s'", namespace)
|
| 75 |
+
except Exception as e:
|
| 76 |
+
logger.error("Failed to delete namespace from Pinecone: %s", e)
|
| 77 |
+
|
| 78 |
+
def query(self, query_vector: list[float], namespace: str, top_k: int = 5) -> list[dict]:
|
| 79 |
+
"""
|
| 80 |
+
Query Pinecone for the most similar chunks.
|
| 81 |
+
|
| 82 |
+
Returns list of {"text", "source_id", "source_filename", "chunk_index", "score"}.
|
| 83 |
+
"""
|
| 84 |
+
try:
|
| 85 |
+
index = self._get_index()
|
| 86 |
+
results = index.query(
|
| 87 |
+
vector=query_vector,
|
| 88 |
+
namespace=namespace,
|
| 89 |
+
top_k=top_k,
|
| 90 |
+
include_metadata=True,
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
matches = []
|
| 94 |
+
for match in results.get("matches", []):
|
| 95 |
+
meta = match.get("metadata", {})
|
| 96 |
+
matches.append({
|
| 97 |
+
"text": meta.get("text", ""),
|
| 98 |
+
"source_id": meta.get("source_id", ""),
|
| 99 |
+
"source_filename": meta.get("source_filename", ""),
|
| 100 |
+
"chunk_index": meta.get("chunk_index", 0),
|
| 101 |
+
"score": match.get("score", 0.0),
|
| 102 |
+
})
|
| 103 |
+
return matches
|
| 104 |
+
|
| 105 |
+
except Exception as e:
|
| 106 |
+
logger.error("Pinecone query failed: %s", e)
|
| 107 |
+
return []
|
requirements.txt
CHANGED
|
@@ -1 +1,8 @@
|
|
| 1 |
gradio>=5.0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
gradio>=5.0.0
|
| 2 |
+
sentence-transformers>=2.2.0
|
| 3 |
+
pinecone-client>=3.0.0
|
| 4 |
+
PyMuPDF>=1.23.0
|
| 5 |
+
python-pptx>=0.6.21
|
| 6 |
+
beautifulsoup4>=4.12.0
|
| 7 |
+
requests>=2.31.0
|
| 8 |
+
youtube-transcript-api>=0.6.0
|
state.py
CHANGED
|
@@ -5,7 +5,7 @@ import uuid
|
|
| 5 |
|
| 6 |
|
| 7 |
class Source:
|
| 8 |
-
def __init__(self, id, filename, file_type, size_mb, source_url, chunk_count, status, error_message, created_at):
|
| 9 |
self.id = id
|
| 10 |
self.filename = filename
|
| 11 |
self.file_type = file_type # "pdf", "pptx", "txt", "url", "youtube"
|
|
@@ -15,6 +15,7 @@ class Source:
|
|
| 15 |
self.status = status # "ready", "processing", "failed"
|
| 16 |
self.error_message = error_message
|
| 17 |
self.created_at = created_at
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
class Message:
|
|
@@ -88,6 +89,12 @@ def create_notebook(state, title):
|
|
| 88 |
|
| 89 |
def delete_notebook(state, nb_id):
|
| 90 |
if nb_id in state.notebooks:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
del state.notebooks[nb_id]
|
| 92 |
remaining = list(state.notebooks.keys())
|
| 93 |
state.active_notebook_id = remaining[0] if remaining else None
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
class Source:
|
| 8 |
+
def __init__(self, id, filename, file_type, size_mb, source_url, chunk_count, status, error_message, created_at, file_path=None):
|
| 9 |
self.id = id
|
| 10 |
self.filename = filename
|
| 11 |
self.file_type = file_type # "pdf", "pptx", "txt", "url", "youtube"
|
|
|
|
| 15 |
self.status = status # "ready", "processing", "failed"
|
| 16 |
self.error_message = error_message
|
| 17 |
self.created_at = created_at
|
| 18 |
+
self.file_path = file_path
|
| 19 |
|
| 20 |
|
| 21 |
class Message:
|
|
|
|
| 89 |
|
| 90 |
def delete_notebook(state, nb_id):
|
| 91 |
if nb_id in state.notebooks:
|
| 92 |
+
# Clean up Pinecone vectors for this notebook
|
| 93 |
+
try:
|
| 94 |
+
from persistence.vector_store import VectorStore
|
| 95 |
+
VectorStore().delete_namespace(nb_id)
|
| 96 |
+
except Exception:
|
| 97 |
+
pass # Best-effort cleanup
|
| 98 |
del state.notebooks[nb_id]
|
| 99 |
remaining = list(state.notebooks.keys())
|
| 100 |
state.active_notebook_id = remaining[0] if remaining else None
|
theme.py
CHANGED
|
@@ -220,6 +220,15 @@ CUSTOM_CSS = """
|
|
| 220 |
font-weight: 600; letter-spacing: 0.3px;
|
| 221 |
}
|
| 222 |
.source-badge.ready { background: rgba(34,197,94,0.15); color: #22c55e; }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
/* ββ Welcome hero ββ */
|
| 225 |
.welcome-hero {
|
|
|
|
| 220 |
font-weight: 600; letter-spacing: 0.3px;
|
| 221 |
}
|
| 222 |
.source-badge.ready { background: rgba(34,197,94,0.15); color: #22c55e; }
|
| 223 |
+
.source-badge.processing {
|
| 224 |
+
background: rgba(234,179,8,0.15); color: #eab308;
|
| 225 |
+
animation: pulse-badge 1.5s ease-in-out infinite;
|
| 226 |
+
}
|
| 227 |
+
.source-badge.failed { background: rgba(239,68,68,0.15); color: #ef4444; cursor: help; }
|
| 228 |
+
@keyframes pulse-badge {
|
| 229 |
+
0%, 100% { opacity: 1; }
|
| 230 |
+
50% { opacity: 0.5; }
|
| 231 |
+
}
|
| 232 |
|
| 233 |
/* ββ Welcome hero ββ */
|
| 234 |
.welcome-hero {
|