internomega-terrablue committed on
Commit
9f911b3
Β·
1 Parent(s): 3780b4d

ingestion changes

Browse files
app.py CHANGED
@@ -25,6 +25,7 @@ from pages.sources import (
25
  handle_file_upload,
26
  handle_url_add,
27
  handle_source_delete,
 
28
  )
29
  from pages.artifacts import (
30
  render_no_sources_gate,
@@ -431,6 +432,11 @@ with gr.Blocks(css=CUSTOM_CSS, theme=dark_theme, title="NotebookLM") as demo:
431
  inputs=[file_uploader, user_state],
432
  outputs=[user_state, source_list_html, source_header, source_selector],
433
  api_name=False,
 
 
 
 
 
434
  ).then(
435
  fn=refresh_all,
436
  inputs=[user_state],
@@ -444,6 +450,11 @@ with gr.Blocks(css=CUSTOM_CSS, theme=dark_theme, title="NotebookLM") as demo:
444
  inputs=[url_input, user_state],
445
  outputs=[user_state, source_list_html, source_header, url_input, source_selector],
446
  api_name=False,
 
 
 
 
 
447
  ).then(
448
  fn=refresh_all,
449
  inputs=[user_state],
 
25
  handle_file_upload,
26
  handle_url_add,
27
  handle_source_delete,
28
+ run_ingestion_pipeline,
29
  )
30
  from pages.artifacts import (
31
  render_no_sources_gate,
 
432
  inputs=[file_uploader, user_state],
433
  outputs=[user_state, source_list_html, source_header, source_selector],
434
  api_name=False,
435
+ ).then(
436
+ fn=run_ingestion_pipeline,
437
+ inputs=[user_state],
438
+ outputs=[user_state, source_list_html, source_header, source_selector],
439
+ api_name=False,
440
  ).then(
441
  fn=refresh_all,
442
  inputs=[user_state],
 
450
  inputs=[url_input, user_state],
451
  outputs=[user_state, source_list_html, source_header, url_input, source_selector],
452
  api_name=False,
453
+ ).then(
454
+ fn=run_ingestion_pipeline,
455
+ inputs=[user_state],
456
+ outputs=[user_state, source_list_html, source_header, source_selector],
457
+ api_name=False,
458
  ).then(
459
  fn=refresh_all,
460
  inputs=[user_state],
ingestion_engine/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Ingestion Engine β€” document processing pipeline."""
2
+
3
+ from ingestion_engine.ingestion_manager import IngestionManager
4
+
5
+ __all__ = ["IngestionManager"]
ingestion_engine/chunker.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Recursive character text splitter with overlap."""
2
+
3
+
4
def chunk_text(
    text: str,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
) -> list[dict]:
    """
    Split text into overlapping chunks.

    Sizes are given in tokens and converted with the heuristic
    1 token ~ 4 characters. Splitting walks separator levels from
    paragraph down to word boundaries, then small neighbours are merged
    and a tail-overlap is carried into each following chunk.

    Returns a list of {"text": str, "chunk_index": int}; empty chunks
    are dropped (their enumeration index is skipped, not reused).
    """
    split_order = ["\n\n", "\n", ". ", " "]
    max_chars = chunk_size * 4
    carry_chars = chunk_overlap * 4

    pieces = _recursive_split(text, max_chars, split_order)
    pieces = _merge_small_chunks(pieces, max_chars)
    pieces = _apply_overlap(pieces, carry_chars)

    results: list[dict] = []
    for idx, piece in enumerate(pieces):
        cleaned = piece.strip()
        if cleaned:
            results.append({"text": cleaned, "chunk_index": idx})
    return results
30
+
31
+
32
+ def _recursive_split(text: str, max_chars: int, separators: list[str]) -> list[str]:
33
+ """Split text using the first separator that produces sub-max chunks."""
34
+ if len(text) <= max_chars:
35
+ return [text]
36
+
37
+ for sep in separators:
38
+ parts = text.split(sep)
39
+ if len(parts) > 1:
40
+ chunks = []
41
+ for part in parts:
42
+ if len(part) <= max_chars:
43
+ chunks.append(part)
44
+ else:
45
+ remaining_seps = separators[separators.index(sep) + 1 :]
46
+ if remaining_seps:
47
+ chunks.extend(_recursive_split(part, max_chars, remaining_seps))
48
+ else:
49
+ for i in range(0, len(part), max_chars):
50
+ chunks.append(part[i : i + max_chars])
51
+ return chunks
52
+
53
+ return [text[i : i + max_chars] for i in range(0, len(text), max_chars)]
54
+
55
+
56
+ def _merge_small_chunks(chunks: list[str], max_chars: int) -> list[str]:
57
+ """Merge consecutive small chunks that together fit within max_chars."""
58
+ merged = []
59
+ buffer = ""
60
+ for chunk in chunks:
61
+ if buffer and len(buffer) + len(chunk) + 1 > max_chars:
62
+ merged.append(buffer)
63
+ buffer = chunk
64
+ else:
65
+ buffer = (buffer + " " + chunk) if buffer else chunk
66
+ if buffer:
67
+ merged.append(buffer)
68
+ return merged
69
+
70
+
71
+ def _apply_overlap(chunks: list[str], overlap_chars: int) -> list[str]:
72
+ """Prepend the tail of the previous chunk to each subsequent chunk."""
73
+ if len(chunks) <= 1 or overlap_chars <= 0:
74
+ return chunks
75
+ result = [chunks[0]]
76
+ for i in range(1, len(chunks)):
77
+ prev_tail = chunks[i - 1][-overlap_chars:]
78
+ result.append(prev_tail + " " + chunks[i])
79
+ return result
ingestion_engine/embedding_generator.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Sentence-transformer embedding generation with model caching."""
2
+
3
+ import logging
4
+ from functools import lru_cache
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ MODEL_NAME = "all-MiniLM-L6-v2"
9
+ EMBEDDING_DIM = 384
10
+
11
+
12
@lru_cache(maxsize=1)
def get_model():
    """Load the sentence-transformers model once per process and memoize it.

    The import is deferred so this module can be imported without the
    (heavy) sentence-transformers dependency being loaded.
    """
    from sentence_transformers import SentenceTransformer

    logger.info("Loading embedding model: %s", MODEL_NAME)
    loaded = SentenceTransformer(MODEL_NAME)
    logger.info("Embedding model loaded. Dimension: %d", EMBEDDING_DIM)
    return loaded
21
+
22
+
23
def generate(texts: list[str], batch_size: int = 64) -> list[list[float]]:
    """Encode *texts* into L2-normalized embedding vectors (plain Python lists)."""
    encoder = get_model()
    matrix = encoder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    # .tolist() converts the numpy array into JSON-serializable lists.
    return matrix.tolist()
33
+
34
+
35
def generate_query(query: str) -> list[float]:
    """Embed a single query string (convenience wrapper for future RAG Engine)."""
    (vector,) = generate([query])
    return vector
ingestion_engine/ingestion_manager.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Ingestion Manager β€” orchestrates the full source processing pipeline."""
2
+
3
+ import logging
4
+
5
+ from ingestion_engine import pdf_extractor, text_extractor, url_scrapper, transcripter
6
+ from ingestion_engine.chunker import chunk_text
7
+ from ingestion_engine.embedding_generator import generate
8
+ from persistence.vector_store import VectorStore
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
# Dispatch table: file_type -> callable(file_path, source_url) -> extracted text.
# File-based extractors ignore the URL argument and URL-based extractors ignore
# the path argument; the unused parameter is named with a leading underscore.
EXTRACTORS = {
    "pdf": lambda fp, _url: pdf_extractor.extract(fp),
    "txt": lambda fp, _url: text_extractor.extract(fp),
    "pptx": lambda fp, _url: _extract_pptx(fp),
    "url": lambda _fp, url: url_scrapper.extract(url),
    "youtube": lambda _fp, url: transcripter.extract(url),
}
19
+
20
+
21
def _extract_pptx(file_path: str) -> str:
    """Extract text from a PPTX file (lazy import keeps python-pptx optional).

    Collects every non-empty paragraph from every text-bearing shape and
    joins them with blank lines.
    """
    from pptx import Presentation

    deck = Presentation(file_path)
    collected: list[str] = []
    for slide in deck.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for para in shape.text_frame.paragraphs:
                line = para.text.strip()
                if line:
                    collected.append(line)
    return "\n\n".join(collected)
35
+
36
+
37
class IngestionManager:
    """Orchestrates: extract -> chunk -> embed -> store in Pinecone."""

    def __init__(self):
        # One store per manager; the Pinecone connection itself is lazy.
        self.vector_store = VectorStore()

    def process_source(self, source, file_path: str | None, notebook_id: str) -> tuple[int, str | None]:
        """
        Run the full ingestion pipeline for a single source.

        Args:
            source: state.Source object (id, filename, file_type, source_url are read)
            file_path: local file path (None for URL/YouTube sources)
            notebook_id: used as Pinecone namespace

        Returns:
            (chunk_count, error_message) — error_message is None on success
        """
        try:
            # Step 1: Extract text via the type-keyed dispatch table.
            extractor = EXTRACTORS.get(source.file_type)
            if not extractor:
                return 0, f"Unsupported file type: {source.file_type}"

            raw_text = extractor(file_path, source.source_url)
            if not raw_text or not raw_text.strip():
                return 0, "No text could be extracted from this source."

            logger.info("Extracted %d characters from %s", len(raw_text), source.filename)

            # Step 2: Chunk
            chunks = chunk_text(raw_text, chunk_size=500, chunk_overlap=50)
            if not chunks:
                return 0, "Text was extracted but produced no usable chunks."

            logger.info("Created %d chunks from %s", len(chunks), source.filename)

            # Step 3: Embed
            vectors = generate([c["text"] for c in chunks])

            logger.info("Generated %d embeddings for %s", len(vectors), source.filename)

            # Step 4: Prepare records and upsert to Pinecone
            records = [
                {
                    # Vector id is "<source_id>_<chunk_index>" so a whole source
                    # can later be deleted via the source_id metadata filter.
                    "id": f"{source.id}_{chunk['chunk_index']}",
                    "values": vector,
                    "metadata": {
                        "source_id": source.id,
                        "source_filename": source.filename,
                        "chunk_index": chunk["chunk_index"],
                        # Truncate stored text to keep per-vector metadata small.
                        "text": chunk["text"][:1000],
                    },
                }
                for chunk, vector in zip(chunks, vectors)
            ]

            self.vector_store.upsert(records, namespace=notebook_id)

            logger.info("Stored %d vectors for %s in namespace %s", len(records), source.filename, notebook_id)
            return len(chunks), None

        except Exception as e:
            # Boundary handler: the pipeline must not crash the UI callback.
            # logger.exception keeps the traceback; the string is user-facing.
            logger.exception("Ingestion failed for %s", source.filename)
            return 0, f"Ingestion error: {e}"
ingestion_engine/pdf_extractor.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PDF text extraction using PyMuPDF."""
2
+
3
+ import fitz
4
+
5
+
6
def extract(file_path: str) -> str:
    """Extract text from all pages of a PDF file.

    Pages with no extractable text are skipped; the rest are joined with
    blank lines.
    """
    # fitz.Document supports the context-manager protocol; this guarantees
    # the file handle is released even if a page's get_text() raises
    # (the original explicit close() was skipped on exceptions).
    with fitz.open(file_path) as doc:
        pages = []
        for page in doc:
            text = page.get_text()
            if text.strip():
                pages.append(text)
    return "\n\n".join(pages)
ingestion_engine/text_extractor.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """Plain text file extraction."""
2
+
3
+
4
def extract(file_path: str) -> str:
    """Read a plain text file as UTF-8; undecodable bytes become U+FFFD."""
    with open(file_path, mode="r", encoding="utf-8", errors="replace") as handle:
        contents = handle.read()
    return contents
ingestion_engine/transcripter.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """YouTube video transcript extraction."""
2
+
3
+ from urllib.parse import urlparse, parse_qs
4
+ from youtube_transcript_api import YouTubeTranscriptApi
5
+
6
+
7
def extract(url: str) -> str:
    """Fetch a YouTube video's transcript as one space-joined string.

    Raises ValueError when the URL does not contain a recognizable video ID.
    """
    vid = _parse_video_id(url)
    if not vid:
        raise ValueError(f"Could not parse YouTube video ID from: {url}")

    segments = YouTubeTranscriptApi.get_transcript(vid)
    parts = [segment["text"] for segment in segments]
    return " ".join(parts)
15
+
16
+
17
+ def _parse_video_id(url: str) -> str | None:
18
+ """Extract video ID from youtube.com/watch?v=... or youtu.be/... URLs."""
19
+ parsed = urlparse(url)
20
+ hostname = parsed.hostname or ""
21
+ if "youtu.be" in hostname:
22
+ return parsed.path.lstrip("/")
23
+ if "youtube.com" in hostname:
24
+ return parse_qs(parsed.query).get("v", [None])[0]
25
+ return None
ingestion_engine/url_scrapper.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Web page text extraction via requests + BeautifulSoup."""
2
+
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+
6
+
7
def extract(url: str) -> str:
    """Fetch a web page and return its visible text, one trimmed line per element.

    Raises requests.HTTPError for non-2xx responses.
    """
    response = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0 (NotebookLM Bot)"},
        timeout=15,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Strip page chrome and non-content markup before extracting text.
    for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
        element.decompose()

    raw = soup.get_text(separator="\n", strip=True)
    kept = []
    for line in raw.splitlines():
        trimmed = line.strip()
        if trimmed:
            kept.append(trimmed)
    return "\n".join(kept)
pages/sources.py CHANGED
@@ -59,6 +59,14 @@ def render_source_list(state: UserData) -> str:
59
  meta_parts.append(f"{source.chunk_count} chunks")
60
  meta_str = " Β· ".join(meta_parts)
61
 
 
 
 
 
 
 
 
 
62
  html += (
63
  f'<div class="source-card">'
64
  f'<div class="source-icon {ft}">{cfg["icon"]}</div>'
@@ -66,7 +74,7 @@ def render_source_list(state: UserData) -> str:
66
  f'<div class="name">{source.filename}</div>'
67
  f'<div class="meta">{meta_str}</div>'
68
  f'</div>'
69
- f'<span class="source-badge ready">Ready</span>'
70
  f'</div>'
71
  )
72
  return html
@@ -100,13 +108,14 @@ def handle_file_upload(files, state: UserData) -> tuple[UserData, str, str, list
100
  if file_ext not in ALLOWED_TYPES:
101
  continue
102
 
103
- # Get file size
104
  try:
105
  import os
106
  file_path = f.name if hasattr(f, 'name') else str(f)
107
  size_bytes = os.path.getsize(file_path)
108
  size_mb = round(size_bytes / (1024 * 1024), 2)
109
  except Exception:
 
110
  size_mb = 0
111
 
112
  if size_mb > MAX_FILE_SIZE_MB:
@@ -119,9 +128,10 @@ def handle_file_upload(files, state: UserData) -> tuple[UserData, str, str, list
119
  size_mb=size_mb,
120
  source_url=None,
121
  chunk_count=0,
122
- status="ready",
123
  error_message=None,
124
  created_at=datetime.now().isoformat(),
 
125
  )
126
  nb.sources.append(source)
127
 
@@ -153,9 +163,10 @@ def handle_url_add(url: str, state: UserData) -> tuple[UserData, str, str, str,
153
  size_mb=None,
154
  source_url=url,
155
  chunk_count=0,
156
- status="ready",
157
  error_message=None,
158
  created_at=datetime.now().isoformat(),
 
159
  )
160
  nb.sources.append(source)
161
 
@@ -163,10 +174,47 @@ def handle_url_add(url: str, state: UserData) -> tuple[UserData, str, str, str,
163
 
164
 
165
  def handle_source_delete(source_name: str, state: UserData) -> tuple[UserData, str, str, list[str]]:
166
- """Delete a source by filename. Returns (state, source_list_html, header_html, source_choices)."""
167
  nb = get_active_notebook(state)
168
  if not nb or not source_name:
169
  return state, render_source_list(state), render_source_header(state), get_source_choices(state)
170
 
 
 
 
 
 
 
 
 
 
171
  nb.sources = [s for s in nb.sources if s.filename != source_name]
172
  return state, render_source_list(state), render_source_header(state), get_source_choices(state)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  meta_parts.append(f"{source.chunk_count} chunks")
60
  meta_str = " Β· ".join(meta_parts)
61
 
62
+ if source.status == "processing":
63
+ badge = '<span class="source-badge processing">Processing...</span>'
64
+ elif source.status == "failed":
65
+ err = source.error_message or "Unknown error"
66
+ badge = f'<span class="source-badge failed" title="{err}">Failed</span>'
67
+ else:
68
+ badge = '<span class="source-badge ready">Ready</span>'
69
+
70
  html += (
71
  f'<div class="source-card">'
72
  f'<div class="source-icon {ft}">{cfg["icon"]}</div>'
 
74
  f'<div class="name">{source.filename}</div>'
75
  f'<div class="meta">{meta_str}</div>'
76
  f'</div>'
77
+ f'{badge}'
78
  f'</div>'
79
  )
80
  return html
 
108
  if file_ext not in ALLOWED_TYPES:
109
  continue
110
 
111
+ # Get file size and path
112
  try:
113
  import os
114
  file_path = f.name if hasattr(f, 'name') else str(f)
115
  size_bytes = os.path.getsize(file_path)
116
  size_mb = round(size_bytes / (1024 * 1024), 2)
117
  except Exception:
118
+ file_path = None
119
  size_mb = 0
120
 
121
  if size_mb > MAX_FILE_SIZE_MB:
 
128
  size_mb=size_mb,
129
  source_url=None,
130
  chunk_count=0,
131
+ status="processing",
132
  error_message=None,
133
  created_at=datetime.now().isoformat(),
134
+ file_path=file_path,
135
  )
136
  nb.sources.append(source)
137
 
 
163
  size_mb=None,
164
  source_url=url,
165
  chunk_count=0,
166
+ status="processing",
167
  error_message=None,
168
  created_at=datetime.now().isoformat(),
169
+ file_path=None,
170
  )
171
  nb.sources.append(source)
172
 
 
174
 
175
 
176
def handle_source_delete(source_name: str, state: UserData) -> tuple[UserData, str, str, list[str]]:
    """Delete a source by filename and remove its vectors from Pinecone.

    Returns (state, source_list_html, header_html, source_choices) so every
    source-related UI component can be refreshed from one callback.
    """
    nb = get_active_notebook(state)
    if not nb or not source_name:
        # Nothing to delete; re-render current state unchanged.
        return state, render_source_list(state), render_source_header(state), get_source_choices(state)

    # Delete vectors from Pinecone before removing from state
    # (after removal the source id would no longer be reachable).
    source_to_delete = next((s for s in nb.sources if s.filename == source_name), None)
    if source_to_delete:
        try:
            # Local import so the page module doesn't require the vector store at import time.
            from persistence.vector_store import VectorStore
            VectorStore().delete_by_source(source_to_delete.id, nb.id)
        except Exception:
            pass  # Best-effort; source removed from UI regardless

    nb.sources = [s for s in nb.sources if s.filename != source_name]
    return state, render_source_list(state), render_source_header(state), get_source_choices(state)
193
+
194
+
195
def run_ingestion_pipeline(state: UserData) -> tuple[UserData, str, str, list[str]]:
    """Process all sources with status='processing' through the ingestion engine.

    Each processed source is flipped to 'ready' (with its chunk count) or
    'failed' (with an error message). Returns (state, source_list_html,
    header_html, source_choices) for the UI refresh.
    """
    # Local import keeps the ingestion stack (embedding model, Pinecone client)
    # off this module's import path.
    from ingestion_engine import IngestionManager

    nb = get_active_notebook(state)
    if not nb:
        return state, render_source_list(state), render_source_header(state), get_source_choices(state)

    manager = IngestionManager()

    for source in nb.sources:
        # Only newly-added sources are picked up; ready/failed ones are untouched.
        if source.status != "processing":
            continue

        chunk_count, error = manager.process_source(source, source.file_path, nb.id)

        if error:
            source.status = "failed"
            source.error_message = error
            source.chunk_count = 0
        else:
            source.status = "ready"
            source.error_message = None
            source.chunk_count = chunk_count

    return state, render_source_list(state), render_source_header(state), get_source_choices(state)
persistence/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Persistence Layer β€” storage services."""
2
+
3
+ from persistence.vector_store import VectorStore
4
+
5
+ __all__ = ["VectorStore"]
persistence/vector_store.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pinecone vector store operations."""
2
+
3
+ import os
4
+ import logging
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ INDEX_NAME = "notebooklm"
9
+ UPSERT_BATCH_SIZE = 100
10
+
11
+
12
class VectorStore:
    """Pinecone client for upserting, deleting, and querying vectors."""

    def __init__(self):
        # The index connection is established lazily on first use.
        self._index = None

    def _get_index(self):
        """Return the cached Pinecone index, connecting on first call.

        Raises RuntimeError when PINECONE_API_KEY is missing.
        """
        if self._index is None:
            from pinecone import Pinecone

            api_key = os.environ.get("PINECONE_API_KEY")
            if not api_key:
                raise RuntimeError(
                    "PINECONE_API_KEY not found in environment. "
                    "Add it as a Secret in your HF Space settings."
                )

            client = Pinecone(api_key=api_key)
            self._index = client.Index(INDEX_NAME)
            logger.info("Connected to Pinecone index: %s", INDEX_NAME)
        return self._index

    def upsert(self, records: list[dict], namespace: str) -> int:
        """
        Upsert embedding records into Pinecone in batches.

        Args:
            records: List of {"id": str, "values": list[float], "metadata": dict}
            namespace: Pinecone namespace (notebook_id)

        Returns:
            Number of vectors upserted
        """
        index = self._get_index()
        sent = 0
        while sent < len(records):
            batch = records[sent:sent + UPSERT_BATCH_SIZE]
            index.upsert(vectors=batch, namespace=namespace)
            sent += len(batch)
        logger.info("Upserted %d vectors to namespace '%s'", sent, namespace)
        return sent

    def delete_by_source(self, source_id: str, namespace: str) -> None:
        """Delete all vectors belonging to a specific source (best-effort)."""
        try:
            self._get_index().delete(
                filter={"source_id": {"$eq": source_id}},
                namespace=namespace,
            )
            logger.info("Deleted vectors for source '%s' from namespace '%s'", source_id, namespace)
        except Exception as e:
            logger.error("Failed to delete vectors from Pinecone: %s", e)

    def delete_namespace(self, namespace: str) -> None:
        """Delete all vectors in a namespace (when a notebook is deleted)."""
        try:
            self._get_index().delete(delete_all=True, namespace=namespace)
            logger.info("Deleted entire namespace '%s'", namespace)
        except Exception as e:
            logger.error("Failed to delete namespace from Pinecone: %s", e)

    def query(self, query_vector: list[float], namespace: str, top_k: int = 5) -> list[dict]:
        """
        Query Pinecone for the most similar chunks.

        Returns list of {"text", "source_id", "source_filename", "chunk_index", "score"};
        an empty list on any failure.
        """
        try:
            response = self._get_index().query(
                vector=query_vector,
                namespace=namespace,
                top_k=top_k,
                include_metadata=True,
            )

            hits = []
            for hit in response.get("matches", []):
                meta = hit.get("metadata", {})
                hits.append({
                    "text": meta.get("text", ""),
                    "source_id": meta.get("source_id", ""),
                    "source_filename": meta.get("source_filename", ""),
                    "chunk_index": meta.get("chunk_index", 0),
                    "score": hit.get("score", 0.0),
                })
            return hits

        except Exception as e:
            logger.error("Pinecone query failed: %s", e)
            return []
requirements.txt CHANGED
@@ -1 +1,8 @@
1
  gradio>=5.0.0
 
 
 
 
 
 
 
 
1
  gradio>=5.0.0
2
+ sentence-transformers>=2.2.0
3
+ pinecone-client>=3.0.0
4
+ PyMuPDF>=1.23.0
5
+ python-pptx>=0.6.21
6
+ beautifulsoup4>=4.12.0
7
+ requests>=2.31.0
8
+ youtube-transcript-api>=0.6.0
state.py CHANGED
@@ -5,7 +5,7 @@ import uuid
5
 
6
 
7
  class Source:
8
- def __init__(self, id, filename, file_type, size_mb, source_url, chunk_count, status, error_message, created_at):
9
  self.id = id
10
  self.filename = filename
11
  self.file_type = file_type # "pdf", "pptx", "txt", "url", "youtube"
@@ -15,6 +15,7 @@ class Source:
15
  self.status = status # "ready", "processing", "failed"
16
  self.error_message = error_message
17
  self.created_at = created_at
 
18
 
19
 
20
  class Message:
@@ -88,6 +89,12 @@ def create_notebook(state, title):
88
 
89
  def delete_notebook(state, nb_id):
90
  if nb_id in state.notebooks:
 
 
 
 
 
 
91
  del state.notebooks[nb_id]
92
  remaining = list(state.notebooks.keys())
93
  state.active_notebook_id = remaining[0] if remaining else None
 
5
 
6
 
7
  class Source:
8
+ def __init__(self, id, filename, file_type, size_mb, source_url, chunk_count, status, error_message, created_at, file_path=None):
9
  self.id = id
10
  self.filename = filename
11
  self.file_type = file_type # "pdf", "pptx", "txt", "url", "youtube"
 
15
  self.status = status # "ready", "processing", "failed"
16
  self.error_message = error_message
17
  self.created_at = created_at
18
+ self.file_path = file_path
19
 
20
 
21
  class Message:
 
89
 
90
def delete_notebook(state, nb_id):
    """Remove a notebook from state, best-effort deleting its Pinecone namespace first."""
    if nb_id not in state.notebooks:
        return

    # Vector cleanup must never block notebook deletion, so failures are swallowed.
    try:
        from persistence.vector_store import VectorStore
        VectorStore().delete_namespace(nb_id)
    except Exception:
        pass

    del state.notebooks[nb_id]
    survivors = list(state.notebooks)
    state.active_notebook_id = survivors[0] if survivors else None
theme.py CHANGED
@@ -220,6 +220,15 @@ CUSTOM_CSS = """
220
  font-weight: 600; letter-spacing: 0.3px;
221
  }
222
  .source-badge.ready { background: rgba(34,197,94,0.15); color: #22c55e; }
 
 
 
 
 
 
 
 
 
223
 
224
  /* ── Welcome hero ── */
225
  .welcome-hero {
 
220
  font-weight: 600; letter-spacing: 0.3px;
221
  }
222
  .source-badge.ready { background: rgba(34,197,94,0.15); color: #22c55e; }
223
+ .source-badge.processing {
224
+ background: rgba(234,179,8,0.15); color: #eab308;
225
+ animation: pulse-badge 1.5s ease-in-out infinite;
226
+ }
227
+ .source-badge.failed { background: rgba(239,68,68,0.15); color: #ef4444; cursor: help; }
228
+ @keyframes pulse-badge {
229
+ 0%, 100% { opacity: 1; }
230
+ 50% { opacity: 0.5; }
231
+ }
232
 
233
  /* ── Welcome hero ── */
234
  .welcome-hero {