dlokesha committed on
Commit
2883e00
Β·
1 Parent(s): c092a08

feat: txt ingestion pipeline with chunking and embeddings

Browse files
Files changed (4) hide show
  1. app.py +100 -0
  2. backend/ingestion_txt.py +273 -0
  3. db/schema.sql +18 -0
  4. requirements.txt +2 -0
app.py CHANGED
@@ -6,6 +6,7 @@ from dotenv import load_dotenv
6
  load_dotenv(Path(__file__).resolve().parent.parent / ".env")
7
  load_dotenv(Path(__file__).resolve().parent / ".env")
8
 
 
9
  import gradio as gr
10
 
11
  from backend.notebook_service import create_notebook, list_notebooks, rename_notebook, delete_notebook
@@ -147,6 +148,70 @@ def _build_row_updates(notebooks):
147
  out.append(gr.update(value=name, visible=visible))
148
  return out
149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
  with gr.Blocks(
152
  title="NotebookLM Clone - Notebooks",
@@ -229,4 +294,39 @@ with gr.Blocks(
229
  outputs=[selected_notebook_id],
230
  ).then(_on_select, None, [status])
231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  demo.launch()
 
6
  load_dotenv(Path(__file__).resolve().parent.parent / ".env")
7
  load_dotenv(Path(__file__).resolve().parent / ".env")
8
 
9
+ from datetime import datetime
10
  import gradio as gr
11
 
12
  from backend.notebook_service import create_notebook, list_notebooks, rename_notebook, delete_notebook
 
148
  out.append(gr.update(value=name, visible=visible))
149
  return out
150
 
151
+ # ── Upload Handler Functions ──────────────────────────────────
152
def _do_upload(text_content, title, notebook_id, profile: gr.OAuthProfile | None):
    """Handle direct text input: validate, ingest, and refresh the source list.

    Returns a (status_markdown, sources_markdown) tuple for the two output
    widgets. On validation errors the second element is empty.
    """
    from backend.ingestion_txt import ingest_txt, list_sources

    user_id = _user_id(profile)

    if not user_id:
        return "❌ Please sign in first.", ""
    if not notebook_id:
        return "❌ Please select a notebook first.", ""
    if not text_content or not text_content.strip():
        return "❌ No text entered.", ""

    try:
        # Use title as filename, fallback to timestamp
        filename = (title or "").strip()
        if not filename:
            filename = f"text_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        if not filename.endswith(".txt"):
            filename = filename + ".txt"

        # Convert text to bytes for ingestion pipeline
        file_bytes = text_content.encode("utf-8")

        result = ingest_txt(
            file_bytes=file_bytes,
            filename=filename,
            notebook_id=notebook_id,
            user_id=user_id
        )

        meta = result["metadata"]
        status_msg = (
            f"βœ… **{result['filename']}** saved successfully!\n\n"
            f"- Size: {meta['size_bytes'] / 1024:.1f} KB"
        )

        # Refresh the sources table so the new upload shows up immediately.
        # (This call was previously commented out, so a successful upload
        # blanked the sources display instead of updating it.)
        sources = list_sources(notebook_id)
        return status_msg, _format_sources(sources)

    except ValueError as e:
        return f"❌ {str(e)}", ""
    except Exception as e:
        # Boundary handler: surface anything unexpected to the UI.
        return f"❌ Unexpected error: {str(e)}", ""
196
+
197
+ def _format_sources(sources: list[dict]) -> str:
198
+ if not sources:
199
+ return "No sources yet."
200
+ lines = ["| Filename | Type | Status | Words |",
201
+ "|----------|------|--------|-------|"]
202
+ for s in sources:
203
+ meta = s.get("metadata") or {}
204
+ words = meta.get("word_count", "β€”")
205
+ lines.append(f"| {s['filename']} | {s['file_type']} | {s['status']} | {words} |")
206
+ return "\n".join(lines)
207
+
208
+
209
def _load_sources(notebook_id, profile: gr.OAuthProfile | None):
    """Return the Markdown source table for the selected notebook ('' if none)."""
    from backend.ingestion_txt import list_sources

    if not notebook_id:
        return ""
    return _format_sources(list_sources(notebook_id))
215
 
216
  with gr.Blocks(
217
  title="NotebookLM Clone - Notebooks",
 
294
  outputs=[selected_notebook_id],
295
  ).then(_on_select, None, [status])
296
 
297
+ # ── Text Input Section ────────────────────────────────────
298
+ gr.Markdown("---")
299
+ gr.Markdown("## Add Text")
300
+ gr.Markdown("Select a notebook above, then paste or type your text.")
301
+
302
+ with gr.Row():
303
+ txt_title = gr.Textbox(
304
+ label="Title",
305
+ placeholder="Give this text a name (e.g. 'Lecture Notes Week 1')",
306
+ scale=1,
307
+ )
308
+
309
+ txt_input = gr.Textbox(
310
+ label="Text Content",
311
+ placeholder="Paste or type your text here...",
312
+ lines=10,
313
+ )
314
+
315
+ submit_btn = gr.Button("Save & Process", variant="primary")
316
+
317
+ upload_status = gr.Markdown("", elem_classes=["status"])
318
+ sources_display = gr.Markdown("")
319
+
320
+ submit_btn.click(
321
+ _do_upload,
322
+ inputs=[txt_input, txt_title, selected_notebook_id],
323
+ outputs=[upload_status, sources_display],
324
+ )
325
+
326
+ selected_notebook_id.change(
327
+ _load_sources,
328
+ inputs=[selected_notebook_id],
329
+ outputs=[sources_display],
330
+ )
331
+
332
  demo.launch()
backend/ingestion_txt.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text file ingestion pipeline.
3
+ Handles .txt upload β†’ extract β†’ clean β†’ save to Supabase DB + Storage.
4
+ """
5
+
6
+ import chardet
7
+ import re
8
+ from datetime import datetime
9
+ from uuid import uuid4
10
+
11
+ from backend.db import supabase
12
+ from backend.storage import save_file, get_sources_path
13
+
14
+ import os
15
+ from sentence_transformers import SentenceTransformer
16
+
17
+ # Load model once at module level (not on every call)
18
+ _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
19
+ # ── Constants ────────────────────────────────────────────────
20
+
21
+ MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB
22
+
23
+
24
+ # ── Text Processing ──────────────────────────────────────────
25
+
26
def detect_encoding(file_bytes: bytes) -> str:
    """
    Guess the character encoding of *file_bytes* using chardet.

    Returns the detected encoding name, or "utf-8" when chardet reports
    no encoding at all or its confidence is below 0.7.
    """
    guess = chardet.detect(file_bytes)
    name = guess.get("encoding") or "utf-8"
    score = guess.get("confidence") or 0

    # A low-confidence guess is worse than the safe utf-8 default.
    return name if score >= 0.7 else "utf-8"
39
+
40
+
41
def clean_text(text: str) -> str:
    """
    Clean raw extracted text.

    - Removes null bytes
    - Removes control characters, including DEL and C1 (keeps \\n and \\t)
    - Normalizes 3+ consecutive newlines down to a single blank line
    - Strips leading/trailing whitespace
    """
    # Remove null bytes
    text = text.replace("\x00", "")

    # Drop C0 controls (except \n and \t), DEL (\x7f) and C1 controls
    # (\x80-\x9f). The previous `ch >= " "` test let DEL and the C1 range
    # through even though the contract is "remove control characters".
    text = "".join(
        ch for ch in text
        if ch in ("\n", "\t") or " " <= ch < "\x7f" or ch > "\x9f"
    )

    # Normalize 3+ blank lines β†’ 2
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()
62
+
63
+
64
+ # ── Supabase DB Operations ───────────────────────────────────
65
+
66
def _create_source_record(
    source_id: str,
    notebook_id: str,
    user_id: str,
    filename: str,
    storage_path: str
) -> None:
    """Insert a fresh `sources` row, marked PENDING until ingestion finishes."""
    row = {
        "id": source_id,
        "notebook_id": notebook_id,
        "user_id": user_id,
        "filename": filename,
        "file_type": "txt",
        "status": "PENDING",
        "storage_path": storage_path,
    }
    supabase.table("sources").insert(row).execute()
83
+
84
+ # ── Chunking ─────────────────────────────────────────────────
85
def chunk_text(text: str, source_id: str, notebook_id: str, filename: str = "") -> list[dict]:
    """
    Split *text* into word-based chunks of ~400 words with a 40-word overlap.

    Returns a list of chunk dicts ready for embedding/insert; each carries
    metadata (word_count, file_name, chunk_index, total_chunks).

    Fix: the previous loop emitted a final chunk made up entirely of
    overlap words whenever the word count landed on a step boundary
    (e.g. a 400-word text produced 2 chunks, the second a pure duplicate
    of the first chunk's tail).
    """
    words = text.split()
    chunk_size = 400
    overlap = 40
    step = chunk_size - overlap

    chunks: list[dict] = []
    start = 0
    while start < len(words):
        window = words[start:start + chunk_size]
        chunks.append({
            "id": str(uuid4()),
            "source_id": source_id,
            "notebook_id": notebook_id,
            "content": " ".join(window),
            "chunk_index": len(chunks),
            "metadata": {
                "word_count": len(window),
                "file_name": filename,
                "chunk_index": len(chunks),
                "total_chunks": 0,  # backfilled below once the count is known
            }
        })
        # Stop once this window already reached the end of the text;
        # otherwise the next window would contain only overlap words.
        if start + chunk_size >= len(words):
            break
        start += step

    # Backfill the real chunk count — always consistent with the loop,
    # unlike the previous up-front ceiling formula.
    for chunk in chunks:
        chunk["metadata"]["total_chunks"] = len(chunks)

    return chunks
114
+
115
+
116
+ # ── Embed + Store ─────────────────────────────────────────────
117
def embed_and_store_chunks(chunks: list[dict]) -> None:
    """
    Batch-embed chunk contents with the module-level SentenceTransformer
    and insert the rows (content + pgvector embedding) into the `chunks`
    table. No-op on an empty list.
    """
    if not chunks:
        return

    # A single encode() call over the whole batch is far cheaper than
    # embedding chunk by chunk.
    vectors = _model.encode([c["content"] for c in chunks], show_progress_bar=False)

    rows = [
        {
            "id": str(chunk["id"]),
            "source_id": str(chunk["source_id"]),
            "notebook_id": str(chunk["notebook_id"]),
            "content": chunk["content"],
            "embedding": vec.tolist(),
            "metadata": chunk["metadata"],
        }
        for chunk, vec in zip(chunks, vectors)
    ]

    try:
        supabase.table("chunks").insert(rows).execute()
        print(f"βœ… Inserted {len(rows)} chunks into pgvector")
    except Exception as e:
        print(f"❌ Failed to insert chunks: {e}")
        raise
146
+
147
def _update_source_ready(
    source_id: str,
    extracted_text: str,
    metadata: dict
) -> None:
    """Mark source as READY with extracted text and metadata."""
    from datetime import timezone  # local import: module only imports datetime

    supabase.table("sources").update({
        "status": "READY",
        "extracted_text": extracted_text,
        "metadata": metadata,
        # Timezone-aware UTC timestamp. The previous utcnow().isoformat()
        # is deprecated and carries no offset, so a timestamptz column
        # would interpret it in the session timezone instead of UTC.
        "updated_at": datetime.now(timezone.utc).isoformat(),
    }).eq("id", source_id).execute()
159
+
160
+
161
def _update_source_failed(source_id: str, error: str) -> None:
    """Mark source as FAILED with the error message stored in metadata."""
    from datetime import timezone  # local import: module only imports datetime

    supabase.table("sources").update({
        "status": "FAILED",
        "metadata": {"error": error},
        # Timezone-aware UTC timestamp; utcnow() is deprecated and naive.
        "updated_at": datetime.now(timezone.utc).isoformat(),
    }).eq("id", source_id).execute()
168
+
169
+
170
+ # ── Main Ingestion Function ──────────────────────────────────
171
+
172
def ingest_txt(
    file_bytes: bytes,
    filename: str,
    notebook_id: str,
    user_id: str
) -> dict:
    """
    Full pipeline for a .txt file upload:
    1. Validate size and extension
    2. Upload raw file to Supabase Storage
    3. Create source record (PENDING)
    4. Detect encoding + decode
    5. Clean text
    6. Update source record (READY)
    7. Chunk, embed, and store vectors

    Returns dict with source_id, filename, status, metadata,
    extracted_text and chunks_created.
    Raises ValueError on validation errors; any pipeline failure is
    re-raised after the source row is marked FAILED.
    """

    # ── Validate ─────────────────────────────────────────────
    if not file_bytes:
        raise ValueError("Empty file β€” nothing to ingest.")

    if len(file_bytes) > MAX_FILE_SIZE:
        # Derive the limit from the constant so the message can't drift
        # from the actual check (was hardcoded "10MB").
        raise ValueError(f"File too large. Max size is {MAX_FILE_SIZE // (1024 * 1024)}MB.")

    if not filename.lower().endswith(".txt"):
        raise ValueError("Only .txt files are accepted here.")

    # ── Generate IDs ─────────────────────────────────────────
    source_id = str(uuid4())

    # ── Upload raw file to Supabase Storage ──────────────────
    sources_path = get_sources_path(user_id, notebook_id)
    # Prefix with source_id so two uploads of the same name never collide.
    storage_path = f"{sources_path}/{source_id}_{filename}"

    save_file(storage_path, file_bytes)

    # ── Create DB record (PENDING) ───────────────────────────
    _create_source_record(
        source_id=source_id,
        notebook_id=notebook_id,
        user_id=user_id,
        filename=filename,
        storage_path=storage_path
    )

    # ── Extract + Clean ───────────────────────────────────────
    try:
        encoding = detect_encoding(file_bytes)
        # errors="replace" keeps ingestion alive on a bad encoding guess.
        raw_text = file_bytes.decode(encoding, errors="replace")
        cleaned_text = clean_text(raw_text)

        if not cleaned_text:
            raise ValueError("No text content found after cleaning.")

        metadata = {
            "encoding": encoding,
            "char_count": len(cleaned_text),
            "word_count": len(cleaned_text.split()),
            "line_count": cleaned_text.count("\n") + 1,
            "size_bytes": len(file_bytes),
        }

        # ── Update DB record (READY) ──────────────────────────
        _update_source_ready(source_id, cleaned_text, metadata)

        # ── Chunk + Embed + Store ─────────────────────────────
        print(f"πŸ”„ Starting chunking for {filename}...")
        chunks = chunk_text(cleaned_text, source_id, notebook_id, filename=filename)
        print(f"πŸ”„ Created {len(chunks)} chunks, embedding now...")
        embed_and_store_chunks(chunks)

        return {
            "source_id": source_id,
            "filename": filename,
            "status": "READY",
            "metadata": metadata,
            "extracted_text": cleaned_text,
            "chunks_created": len(chunks),
        }

    except Exception as e:
        print(f"❌ Ingestion failed: {e}")
        _update_source_failed(source_id, str(e))
        raise
259
+
260
+
261
+ # ── List Sources for a Notebook ──────────────────────────────
262
+
263
def list_sources(notebook_id: str) -> list[dict]:
    """Return every source row for *notebook_id*, oldest first."""
    query = (
        supabase.table("sources")
        .select("id, filename, file_type, status, metadata, created_at")
        .eq("notebook_id", notebook_id)
        .order("created_at")
    )
    response = query.execute()
    return response.data or []
db/schema.sql CHANGED
@@ -46,3 +46,21 @@ create table if not exists chunks (
46
  create index if not exists idx_chunks_notebook_id on chunks(notebook_id);
47
  -- Vector index (run after you have data; ivfflat requires rows):
48
  -- create index idx_chunks_embedding on chunks using ivfflat (embedding vector_cosine_ops) with (lists = 100);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  create index if not exists idx_chunks_notebook_id on chunks(notebook_id);
47
  -- Vector index (run after you have data; ivfflat requires rows):
48
  -- create index idx_chunks_embedding on chunks using ivfflat (embedding vector_cosine_ops) with (lists = 100);
49
+
50
+ -- sources table (ingestion pipeline)
51
+ create table if not exists sources (
52
+ id uuid primary key default gen_random_uuid(),
53
+ notebook_id uuid not null references notebooks(id) on delete cascade,
54
+ user_id text not null,
55
+ filename text not null,
56
+ file_type text not null,
57
+ status text not null default 'PENDING',
58
+ storage_path text,
59
+ extracted_text text,
60
+ metadata jsonb default '{}',
61
+ created_at timestamptz default now(),
62
+ updated_at timestamptz default now()
63
+ );
64
+ create index if not exists idx_sources_notebook_id on sources(notebook_id);
65
+ create index if not exists idx_sources_user_id on sources(user_id);
66
+ create index if not exists idx_sources_status on sources(status);
requirements.txt CHANGED
@@ -3,3 +3,5 @@ huggingface_hub==0.24.7
3
  supabase>=2.0.0
4
  python-dotenv>=1.0.0
5
  realtime==2.3.0
 
 
 
3
  supabase>=2.0.0
4
  python-dotenv>=1.0.0
5
  realtime==2.3.0
6
+ chardet>=5.0.0
7
+ sentence-transformers>=2.0.0