pkheria committed on
Commit
b5e0c74
·
0 Parent(s):

pushing to git

Browse files
.env.example ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PROJECT_NAME=KnowledgeHub
2
+ VERSION=1.0.0
3
+ API_V1_STR=/api/v1
4
+
5
+ QDRANT_URL=http://localhost:6333
6
+ QDRANT_API_KEY=
7
+ QDRANT_COLLECTION_NAME=knowledge_base
8
+
9
+ NEMOTRON_PARSE_MODEL=Qwen/Qwen2-VL-2B-Instruct
10
+ NEMOTRON_EMBED_MODEL=nvidia/llama-nemotron-colembed-vl-3b-v2
11
+ EMBEDDING_DEVICE=cpu
12
+ HF_TOKEN=
13
+ NVIDIA_API_KEY=
14
+ NVIDIA_CHAT_MODEL=nvidia/nvidia-nemotron-nano-9b-v2
15
+ NVIDIA_API_URL=https://integrate.api.nvidia.com/v1
16
+
17
+ MAX_UPLOAD_SIZE=10485760
18
+ CHUNK_SIZE=1100
19
+ CHUNK_OVERLAP=180
20
+ ZEROGPU_DURATION_SECONDS=180
21
+ CHAT_TEMPERATURE=0.6
22
+ CHAT_TOP_P=0.95
23
+ CHAT_MAX_TOKENS=2048
24
+ MIN_THINKING_TOKENS=1024
25
+ MAX_THINKING_TOKENS=2048
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ .env
2
+ .venv/
3
+ __pycache__/
4
+ *.py[cod]
5
+ .pytest_cache/
6
+ .ruff_cache/
7
+ data/exports/
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # KnowledgeHub Ingestor
2
+
3
+ KnowledgeHub Ingestor is a modular Gradio app for loading knowledge from:
4
+
5
+ - YouTube links with public transcripts/captions
6
+ - arXiv links or IDs
7
+ - PDF documents
8
+
9
+ It extracts text, chunks it, embeds chunks locally with your embedding model, and uploads vectors into Qdrant for retrieval. The answer generation step uses NVIDIA's OpenAI-compatible chat API.
10
+
11
+ ## Setup
12
+
13
+ ```bash
14
+ python3 -m venv .venv
15
+ source .venv/bin/activate
16
+ pip install -r requirements.txt
17
+ cp .env.example .env
18
+ ```
19
+
20
+ Add `NVIDIA_API_KEY` to `.env` for chat completions. Start Qdrant locally or point `QDRANT_URL` to your hosted instance.
21
+
22
+ The default model split is:
23
+
24
+ - Local parsing model: `Qwen/Qwen2-VL-2B-Instruct`
25
+ - Local embedding model: `nvidia/llama-nemotron-colembed-vl-3b-v2`
26
+ - NVIDIA API chat model: `nvidia/nvidia-nemotron-nano-9b-v2`
27
+
28
+ ## Run
29
+
30
+ ```bash
31
+ python app.py
32
+ ```
33
+
34
+ Open the local Gradio URL printed in the terminal, usually `http://127.0.0.1:7860`.
35
+
36
+ The app binds to `0.0.0.0:7860`, which is suitable for Hugging Face Spaces and container deployments.
37
+
38
+ For Hugging Face ZeroGPU Spaces, set:
39
+
40
+ ```bash
41
+ ENABLE_ZEROGPU=true
42
+ EMBEDDING_DEVICE=cuda
43
+ ```
44
+
45
+ The Gradio ingest/search/answer callbacks are decorated with `spaces.GPU` when running on Spaces. Locally, the decorator becomes a no-op.
46
+
47
+ ## Project Structure
48
+
49
+ ```text
50
+ app/
51
+ core/ settings and shared models
52
+ extractors/ PDF, arXiv, and YouTube extraction
53
+ services/ chunking, embeddings, Qdrant, retrieval, ingestion orchestration
54
+ ui/ Gradio Blocks UI
55
+ utils/ source detection helpers
56
+ ```
57
+
58
+ YouTube extraction requires captions/transcripts to be available for the video. arXiv ingestion downloads the paper PDF and parses it with `pypdf`.
app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import shutil
3
+ import sys
4
+ from pathlib import Path
5
+
6
+
7
+ logging.basicConfig(
8
+ level=logging.INFO,
9
+ format="%(asctime)s %(levelname)s %(message)s",
10
+ )
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def _clear_app_bytecode() -> None:
    """Turn off bytecode writing and remove cached bytecode under ``app/``.

    Sets ``sys.dont_write_bytecode`` for the rest of the process, then deletes
    every ``__pycache__`` directory found below the ``app`` folder that sits
    next to this file. Deletion errors are ignored.
    """
    sys.dont_write_bytecode = True
    package_root = Path(__file__).resolve().parent / "app"
    for stale_cache in package_root.rglob("__pycache__"):
        shutil.rmtree(stale_cache, ignore_errors=True)
18
+
19
+
20
def _main() -> None:
    """Clear cached bytecode, then load and run the Gradio UI until it stops."""
    logger.info("Preparing BuildSmall app")
    _clear_app_bytecode()

    logger.info("Loading Gradio UI")
    # Deferred import: presumably kept after the cache cleanup on purpose so
    # the UI package is loaded from source.
    from app.ui.gradio_app import serve

    logger.info("Launching BuildSmall app")
    serve()
    logger.info("BuildSmall app stopped")


if __name__ == "__main__":
    _main()
app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """KnowledgeHub document ingestion package."""
app/core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Core application settings and data models."""
app/core/config.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from pydantic_settings import BaseSettings, SettingsConfigDict
4
+
5
+
6
class Settings(BaseSettings):
    """Application configuration loaded from the environment and ``.env``.

    Every attribute can be overridden by an environment variable of the same
    name or by an entry in the ``.env`` file located two directories above
    this module. Unknown entries are ignored.
    """

    # --- Project identity ---
    PROJECT_NAME: str = "KnowledgeHub"
    VERSION: str = "1.0.0"
    API_V1_STR: str = "/api/v1"

    # --- Qdrant vector database ---
    QDRANT_URL: str = "http://localhost:6333"
    QDRANT_API_KEY: str = ""
    QDRANT_COLLECTION_NAME: str = "knowledge_base"

    # --- Models and credentials ---
    NEMOTRON_PARSE_MODEL: str = "Qwen/Qwen2-VL-2B-Instruct"
    NEMOTRON_EMBED_MODEL: str = "nvidia/llama-nemotron-colembed-vl-3b-v2"
    EMBEDDING_DEVICE: str = "cpu"
    HF_TOKEN: str = ""
    NVIDIA_API_KEY: str = ""
    NVIDIA_CHAT_MODEL: str = "nvidia/nvidia-nemotron-nano-9b-v2"
    NVIDIA_API_URL: str = "https://integrate.api.nvidia.com/v1"

    # --- Ingestion and chat tuning ---
    MAX_UPLOAD_SIZE: int = 10 * 1024 * 1024  # bytes (10 MiB)
    CHUNK_SIZE: int = 1100
    CHUNK_OVERLAP: int = 180
    ZEROGPU_DURATION_SECONDS: int = 180
    CHAT_TEMPERATURE: float = 0.6
    CHAT_TOP_P: float = 0.95
    CHAT_MAX_TOKENS: int = 2048
    MIN_THINKING_TOKENS: int = 1024
    MAX_THINKING_TOKENS: int = 2048

    model_config = SettingsConfigDict(
        env_file=Path(__file__).resolve().parents[2] / ".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    def get_qdrant_url(self) -> str:
        """Return ``QDRANT_URL`` with an explicit ``:443`` for port-less HTTPS URLs.

        The port is inserted into the host part of the URL, so paths, queries,
        credentials and IPv6 literals are left intact
        (``https://host/path`` -> ``https://host:443/path``). URLs that are not
        HTTPS, already carry a port, or cannot be parsed are returned unchanged.

        NOTE(review): the explicit port is presumably needed because the Qdrant
        client otherwise falls back to its own default port - confirm against
        the qdrant-client documentation.
        """
        from urllib.parse import urlsplit, urlunsplit

        parts = urlsplit(self.QDRANT_URL)
        if parts.scheme != "https" or not parts.netloc:
            return self.QDRANT_URL

        try:
            port = parts.port
        except ValueError:
            # Malformed port (e.g. "https://host:abc"); leave it for the client to report.
            return self.QDRANT_URL
        if port is not None:
            return self.QDRANT_URL

        # rstrip handles the degenerate "https://host:" form (empty port).
        netloc = f"{parts.netloc.rstrip(':')}:443"
        return urlunsplit(parts._replace(netloc=netloc))
43
+
44
+
45
+ settings = Settings()
app/core/models.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ from enum import Enum
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+
7
class SourceType(str, Enum):
    """Kinds of sources the application can ingest.

    Members are also ``str`` instances, so they compare equal to their
    lowercase string values.
    """

    PDF = "pdf"          # a PDF document
    ARXIV = "arxiv"      # an arXiv paper
    YOUTUBE = "youtube"  # a YouTube video transcript
11
+
12
+
13
@dataclass(frozen=True)
class Document:
    """Immutable record of the text extracted from one source."""

    # Which kind of source produced this document.
    source_type: SourceType
    # Human-readable title.
    title: str
    # Full extracted text.
    text: str
    # Where the document came from (a path, URL or entry id, depending on the extractor).
    source: str
    # Extractor-specific extras; each instance gets its own empty dict by default.
    metadata: dict[str, Any] = field(default_factory=dict)
20
+
21
+
22
@dataclass(frozen=True)
class Chunk:
    """Immutable slice of a document's text plus the provenance needed to store it."""

    # Identifier used as the vector-store point id.
    id: str
    # Text content of this chunk.
    text: str
    # Position of the chunk within its document, counted from 0.
    index: int
    # Provenance copied from the parent document.
    source_type: SourceType
    source: str
    title: str
    # Extra metadata; each instance gets its own empty dict by default.
    metadata: dict[str, Any] = field(default_factory=dict)
31
+
32
+
33
@dataclass(frozen=True)
class IngestionResult:
    """Immutable summary of one completed ingestion run."""

    # The document that was extracted.
    document: Document
    # The chunks produced from the document.
    chunks: list[Chunk]
    # Name of the collection the chunks were written to.
    collection_name: str
    # Location of the exported text file.
    export_path: Path
39
+
40
+
41
@dataclass(frozen=True)
class SearchResult:
    """Immutable record of a single hit returned by a vector search."""

    # Similarity score reported by the vector store.
    score: float
    # Stored chunk text.
    text: str
    # Title of the document the chunk belongs to.
    title: str
    # Origin of the document.
    source: str
    # Source kind as a plain string (not the SourceType enum).
    source_type: str
    # Stored metadata; required, no default.
    metadata: dict[str, Any]
app/extractors/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Source-specific document extractors."""
app/extractors/arxiv.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ from pathlib import Path
3
+
4
+ import arxiv
5
+ import requests
6
+
7
+ from app.core.models import Document, SourceType
8
+ from app.extractors.pdf import extract_pdf
9
+ from app.utils.source_detection import extract_arxiv_id
10
+
11
+
12
def extract_arxiv(value: str) -> Document:
    """Fetch an arXiv paper and return its text as a ``Document``.

    Args:
        value: An arXiv URL or identifier; it is normalised by
            ``extract_arxiv_id``.

    Returns:
        A ``Document`` of type ``SourceType.ARXIV`` whose ``source`` is the
        paper's arXiv entry id and whose metadata carries the arXiv id,
        authors, publication date, summary, PDF URL and entry id (plus
        whatever ``extract_pdf`` adds).

    Raises:
        ValueError: If arXiv returns no paper for the identifier.
        requests.HTTPError: If the PDF download returns an error status.
    """
    paper_id = extract_arxiv_id(value)
    client = arxiv.Client()
    search = arxiv.Search(id_list=[paper_id])
    paper = next(client.results(search), None)
    if paper is None:
        raise ValueError(f"No arXiv paper found for {paper_id}.")

    pdf_url = paper.pdf_url or f"https://arxiv.org/pdf/{paper_id}.pdf"

    # Old-style arXiv identifiers (e.g. "hep-th/9901001") contain a slash. Used
    # verbatim as a file name, that would point into a sub-directory that does
    # not exist inside the temp dir, so path separators are flattened here.
    safe_name = paper_id.replace("/", "_").replace("\\", "_")

    with tempfile.TemporaryDirectory(prefix="knowledgehub_arxiv_") as tmpdir:
        pdf_path = Path(tmpdir) / f"{safe_name}.pdf"
        response = requests.get(pdf_url, timeout=60)
        response.raise_for_status()
        pdf_path.write_bytes(response.content)

        # Parse while the temporary directory still exists.
        document = extract_pdf(
            pdf_path,
            title=paper.title,
            metadata={
                "arxiv_id": paper_id,
                "authors": [str(author) for author in paper.authors],
                "published": paper.published.isoformat() if paper.published else None,
                "summary": paper.summary,
                "pdf_url": pdf_url,
                "entry_id": paper.entry_id,
            },
        )

    # Re-wrap so the source points at arXiv rather than the deleted temp file.
    return Document(
        source_type=SourceType.ARXIV,
        title=document.title,
        text=document.text,
        source=paper.entry_id,
        metadata=document.metadata,
    )
app/extractors/pdf.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from pypdf import PdfReader
4
+
5
+ from app.core.models import Document, SourceType
6
+
7
+
8
def extract_pdf(path: str | Path, title: str | None = None, metadata: dict | None = None) -> Document:
    """Read the selectable text of a PDF into a ``Document``.

    Args:
        path: Location of the PDF file.
        title: Title to use; falls back to the file name without extension.
        metadata: Extra entries merged over the default ``{"pages": <count>}``.

    Returns:
        A ``Document`` of type ``SourceType.PDF``. Each non-empty page is
        introduced by a ``[Page N]`` marker.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If no page yields any text.
    """
    pdf_path = Path(path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    reader = PdfReader(str(pdf_path))

    page_blocks: list[str] = []
    for page_number, page in enumerate(reader.pages, start=1):
        page_text = (page.extract_text() or "").strip()
        if not page_text:
            # Pages without extractable text are skipped but keep their number.
            continue
        page_blocks.append(f"\n\n[Page {page_number}]\n{page_text}")

    combined_text = "\n".join(page_blocks).strip()
    if not combined_text:
        raise ValueError("No selectable text was found in this PDF.")

    # Caller-supplied entries win over the default page count.
    document_metadata: dict = {"pages": len(reader.pages)}
    document_metadata.update(metadata or {})

    return Document(
        source_type=SourceType.PDF,
        title=title or pdf_path.stem,
        text=combined_text,
        source=str(pdf_path),
        metadata=document_metadata,
    )
app/extractors/youtube.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from urllib.parse import parse_qs, urlparse
2
+
3
+ from youtube_transcript_api import YouTubeTranscriptApi
4
+
5
+ from app.core.models import Document, SourceType
6
+
7
+
8
def _extract_video_id(url: str) -> str:
    """Return the video id contained in a YouTube URL.

    Supported forms: ``youtu.be/<id>``, ``youtube.com/watch?v=<id>`` and
    ``youtube.com/{shorts,embed,live,v}/<id>`` (any sub-domain, also
    ``youtube-nocookie.com``). A missing scheme is tolerated.

    Args:
        url: The URL to inspect; surrounding whitespace is ignored.

    Returns:
        The non-empty video id.

    Raises:
        ValueError: If the host is not a YouTube host or no id is present.
    """
    raw = url.strip()
    if "://" not in raw:
        # Without a scheme urlparse would put the host into the path.
        raw = f"https://{raw}"
    parsed = urlparse(raw)

    # Match hosts exactly (or as a real sub-domain) so that look-alikes such as
    # "notyoutu.be" or "youtube.com.example.org" are rejected.
    host = (parsed.hostname or "").lower()
    segments = [segment for segment in parsed.path.split("/") if segment]

    def _is_host(domain: str) -> bool:
        return host == domain or host.endswith(f".{domain}")

    video_id = ""
    if _is_host("youtu.be"):
        if segments:
            video_id = segments[0]
    elif _is_host("youtube.com") or _is_host("youtube-nocookie.com"):
        query = parse_qs(parsed.query)
        if query.get("v"):
            video_id = query["v"][0]
        elif len(segments) >= 2 and segments[0] in {"shorts", "embed", "live", "v"}:
            video_id = segments[1]

    video_id = video_id.strip()
    if not video_id:
        raise ValueError("Could not find a YouTube video ID in the URL.")
    return video_id
19
+
20
+
21
def extract_youtube(url: str) -> Document:
    """Download a YouTube video's transcript and return it as a ``Document``.

    Each non-empty caption becomes one line of the form ``[MM:SS] text``.

    Args:
        url: A YouTube URL understood by ``_extract_video_id``.

    Returns:
        A ``Document`` of type ``SourceType.YOUTUBE`` whose metadata records
        the video id and the number of transcript segments.

    Raises:
        ValueError: If the URL has no video id or the transcript is empty.
    """
    video_id = _extract_video_id(url)

    # Use the instance ``fetch`` method when the installed library provides
    # it; otherwise fall back to the class-level ``get_transcript`` call.
    api = YouTubeTranscriptApi()
    if not hasattr(api, "fetch"):
        segments = YouTubeTranscriptApi.get_transcript(video_id)
    else:
        segments = api.fetch(video_id).to_raw_data()

    if not segments:
        raise ValueError("No transcript was available for this YouTube video.")

    caption_lines = []
    for segment in segments:
        minutes, seconds = divmod(int(segment.get("start", 0)), 60)
        caption = segment.get("text", "").strip()
        if not caption:
            continue
        caption_lines.append(f"[{minutes:02d}:{seconds:02d}] {caption}")

    transcript_text = "\n".join(caption_lines).strip()
    return Document(
        source_type=SourceType.YOUTUBE,
        title=f"YouTube Transcript {video_id}",
        text=transcript_text,
        source=url,
        metadata={"video_id": video_id, "segments": len(segments)},
    )
+ )
app/services/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Application service layer."""
app/services/chat.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ from openai import OpenAI
4
+
5
+ from app.core.config import settings
6
+ from app.core.models import SearchResult
7
+
8
+
9
@dataclass(frozen=True)
class ChatAnswer:
    """Immutable result of one chat completion over retrieved context."""

    # Text of the model's reply.
    answer: str
    # Reasoning content returned alongside the reply, or None when absent.
    reasoning: str | None
    # The search results that were supplied to the model.
    context: list[SearchResult]
14
+
15
+
16
class NvidiaChatClient:
    """Thin wrapper around the OpenAI client pointed at the NVIDIA chat endpoint."""

    def __init__(self):
        """Create the underlying client.

        Raises:
            ValueError: If ``settings.NVIDIA_API_KEY`` is empty.
        """
        api_key = settings.NVIDIA_API_KEY
        if not api_key:
            raise ValueError("NVIDIA_API_KEY is required for NVIDIA chat completions.")

        self.client = OpenAI(
            base_url=settings.NVIDIA_API_URL,
            api_key=settings.NVIDIA_API_KEY,
        )

    def answer_with_context(self, question: str, context: list[SearchResult]) -> ChatAnswer:
        """Ask the chat model to answer ``question`` using only ``context``.

        Args:
            question: The user's question.
            context: Retrieved passages; they are numbered from 1 in the
                prompt so the model can cite them as ``[n]``.

        Returns:
            A ``ChatAnswer`` with the reply text (empty string when the API
            returns no content), any ``reasoning_content`` attached to the
            reply, and the unchanged ``context`` list.
        """
        numbered_passages = []
        for index, item in enumerate(context, start=1):
            numbered_passages.append(
                f"[{index}] title={item.title}\n"
                f"source={item.source}\n"
                f"score={item.score:.4f}\n"
                f"text={item.text}"
            )
        context_text = "\n\n".join(numbered_passages)

        system_prompt = (
            "You are KnowledgeHub's retrieval assistant. Answer only from the "
            "provided context. If the context is insufficient, say what is missing. "
            "Cite sources using bracket numbers like [1], [2]."
        )
        user_prompt = f"Question:\n{question}\n\nRetrieved context:\n{context_text}"

        completion = self.client.chat.completions.create(
            model=settings.NVIDIA_CHAT_MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=settings.CHAT_TEMPERATURE,
            top_p=settings.CHAT_TOP_P,
            max_tokens=settings.CHAT_MAX_TOKENS,
            frequency_penalty=0,
            presence_penalty=0,
            stream=False,
            # Not standard OpenAI parameters; sent through in the request body.
            extra_body={
                "min_thinking_tokens": settings.MIN_THINKING_TOKENS,
                "max_thinking_tokens": settings.MAX_THINKING_TOKENS,
            },
        )

        reply = completion.choices[0].message
        return ChatAnswer(
            answer=reply.content or "",
            reasoning=getattr(reply, "reasoning_content", None),
            context=context,
        )
app/services/chunking.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import uuid
3
+
4
+ from app.core.models import Chunk, Document
5
+
6
+
7
def chunk_document(document: Document, chunk_size: int, overlap: int) -> list[Chunk]:
    """Split a document's text into overlapping chunks.

    Runs of three or more newlines are collapsed to two before splitting.
    Each window is at most ``chunk_size`` characters. When a paragraph break
    or sentence end lies in the second half of the window, the chunk is cut
    there instead, provided the next window would still start further along
    the text. Consecutive chunks share ``overlap`` characters.

    Args:
        document: The document to split.
        chunk_size: Maximum chunk length in characters.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in reading order, indexed from 0. Every chunk references
        the document's metadata dict (it is not copied).

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than
            ``chunk_size``, or if the document has no text after normalisation.
    """
    if overlap >= chunk_size:
        raise ValueError("Chunk overlap must be smaller than chunk size.")
    if overlap < 0:
        # A negative overlap would silently skip text between chunks.
        raise ValueError("Chunk overlap must not be negative.")

    normalized = re.sub(r"\n{3,}", "\n\n", document.text).strip()
    if not normalized:
        raise ValueError("Document is empty after extraction.")

    text_length = len(normalized)
    half_window = chunk_size // 2

    chunks: list[Chunk] = []
    start = 0
    index = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)
        if end < text_length:
            paragraph_break = normalized.rfind("\n\n", start, end)
            sentence_break = normalized.rfind(". ", start, end)
            best_break = max(paragraph_break, sentence_break)
            # Cut at the break only if it keeps more than half a window AND the
            # next window (break end minus overlap) still starts after the
            # current one. Without the second condition a large overlap could
            # move ``start`` backwards and the loop would never finish.
            keeps_enough = best_break > start + half_window
            still_advances = best_break + 1 - overlap > start
            if keeps_enough and still_advances:
                end = best_break + 1

        text = normalized[start:end].strip()
        if text:
            # Deterministic id: the same source, position and leading text
            # always yield the same UUID.
            digest = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{document.source}:{index}:{text[:80]}"))
            chunks.append(
                Chunk(
                    id=digest,
                    text=text,
                    index=index,
                    source_type=document.source_type,
                    source=document.source,
                    title=document.title,
                    metadata=document.metadata,
                )
            )
            index += 1

        if end == text_length:
            break
        # Strictly greater than the previous ``start`` by construction above.
        start = end - overlap

    return chunks
app/services/embeddings.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import cached_property, lru_cache
2
+
3
+ from app.core.config import settings
4
+ from app.utils.zerogpu import is_enabled as zerogpu_is_enabled
5
+
6
+
7
+ class LocalEmbeddingClient:
8
+ def __init__(self, model: str | None = None, device: str | None = None):
9
+ self.model_name = model or settings.NEMOTRON_EMBED_MODEL
10
+ self.device = device or _resolve_device()
11
+
12
+ @cached_property
13
+ def model(self):
14
+ try:
15
+ from sentence_transformers import SentenceTransformer
16
+ except ImportError as exc:
17
+ raise ImportError(
18
+ "sentence-transformers is required for local embeddings. "
19
+ "Install dependencies with `pip install -r requirements.txt`."
20
+ ) from exc
21
+
22
+ return SentenceTransformer(
23
+ self.model_name,
24
+ device=self.device,
25
+ token=settings.HF_TOKEN or None,
26
+ trust_remote_code=True,
27
+ )
28
+
29
+ @cached_property
30
+ def native_model(self):
31
+ try:
32
+ from transformers import AutoModel
33
+ except ImportError as exc:
34
+ raise ImportError(
35
+ "transformers is required for native local embeddings. "
36
+ "Install dependencies with `pip install -r requirements.txt`."
37
+ ) from exc
38
+
39
+ model = AutoModel.from_pretrained(
40
+ self.model_name,
41
+ token=settings.HF_TOKEN or None,
42
+ trust_remote_code=True,
43
+ dtype="auto" if self.device != "cpu" else None,
44
+ )
45
+ if self.device:
46
+ model = model.to(self.device)
47
+ return model.eval()
48
+
49
+ def embed_texts(self, texts: list[str]) -> list[list[float]]:
50
+ if not texts:
51
+ return []
52
+
53
+ try:
54
+ embeddings = self.model.encode(
55
+ texts,
56
+ batch_size=8,
57
+ normalize_embeddings=True,
58
+ show_progress_bar=False,
59
+ )
60
+ return embeddings.tolist()
61
+ except ValueError as exc:
62
+ if "Modality 'text' is not supported" not in str(exc):
63
+ raise
64
+
65
+ embeddings = self._embed_with_native_query_encoder(texts)
66
+ return embeddings.tolist()
67
+
68
+ def _embed_with_native_query_encoder(self, texts: list[str]):
69
+ try:
70
+ import torch
71
+ import torch.nn.functional as F
72
+ except ImportError as exc:
73
+ raise ImportError(
74
+ "torch is required for the native Nemotron embedding path. "
75
+ "Install dependencies with `pip install -r requirements.txt`."
76
+ ) from exc
77
+
78
+ if not hasattr(self.native_model, "forward_queries"):
79
+ raise ValueError(
80
+ f"{self.model_name} does not support SentenceTransformer text encoding "
81
+ "or a native forward_queries API."
82
+ )
83
+
84
+ with torch.no_grad():
85
+ output = self.native_model.forward_queries(texts, batch_size=4)
86
+
87
+ if isinstance(output, (list, tuple)):
88
+ output = output[0]
89
+
90
+ if not torch.is_tensor(output):
91
+ output = torch.as_tensor(output)
92
+
93
+ if output.ndim == 3:
94
+ output = output.float().mean(dim=1)
95
+ elif output.ndim != 2:
96
+ raise ValueError(f"Unexpected embedding shape from {self.model_name}: {tuple(output.shape)}")
97
+
98
+ return F.normalize(output.float(), p=2, dim=1).cpu()
99
+
100
+
101
@lru_cache(maxsize=1)
def get_embedding_client() -> LocalEmbeddingClient:
    """Return the shared ``LocalEmbeddingClient``, creating it on the first call."""
    shared_client = LocalEmbeddingClient()
    return shared_client
104
+
105
+
106
def _resolve_device() -> str:
    """Pick the embedding device.

    Returns ``"cuda"`` when ZeroGPU is enabled and the configured device is
    ``"cpu"``; otherwise returns ``settings.EMBEDDING_DEVICE`` unchanged.
    """
    zerogpu_active = zerogpu_is_enabled()
    configured = settings.EMBEDDING_DEVICE
    if zerogpu_active and configured == "cpu":
        return "cuda"
    return configured
app/services/ingestion.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from app.core.config import settings
4
+ from app.core.models import Document, IngestionResult, SourceType
5
+ from app.extractors.arxiv import extract_arxiv
6
+ from app.extractors.pdf import extract_pdf
7
+ from app.extractors.youtube import extract_youtube
8
+ from app.services.chat import NvidiaChatClient
9
+ from app.services.chunking import chunk_document
10
+ from app.services.embeddings import get_embedding_client
11
+ from app.services.vector_store import QdrantVectorStore
12
+ from app.utils.source_detection import detect_source
13
+
14
+
15
+ EXPORT_DIR = Path("data/exports")
16
+
17
+
18
+ def extract_document(url: str | None = None, pdf_path: str | None = None) -> Document:
19
+ source_type = detect_source(url, pdf_path)
20
+ if source_type == SourceType.PDF:
21
+ return extract_pdf(str(pdf_path))
22
+ if source_type == SourceType.ARXIV:
23
+ return extract_arxiv(str(url))
24
+ if source_type == SourceType.YOUTUBE:
25
+ return extract_youtube(str(url))
26
+ raise ValueError(f"Unsupported source type: {source_type}")
27
+
28
+
29
def save_markdown(document: Document, chunks_count: int) -> Path:
    """Write the document and a short header to a Markdown file in ``EXPORT_DIR``.

    The file name is the title with every character other than letters,
    digits, ``-`` and ``_`` replaced by ``_``, cut to 80 characters. An empty
    result falls back to the source type value. An existing file with the
    same name is overwritten.

    Args:
        document: The document to export.
        chunks_count: Number of chunks reported in the header.

    Returns:
        Path of the written file.
    """
    EXPORT_DIR.mkdir(parents=True, exist_ok=True)

    sanitized = [ch if (ch.isalnum() or ch in "-_") else "_" for ch in document.title]
    stem = "".join(sanitized)[:80] or document.source_type.value
    path = EXPORT_DIR / f"{stem}.md"

    metadata_lines = "\n".join(f"- {key}: {value}" for key, value in document.metadata.items())
    sections = [
        f"# {document.title}",
        "",
        f"- Source type: {document.source_type.value}",
        f"- Source: {document.source}",
        f"- Chunks uploaded: {chunks_count}",
        metadata_lines,
        "",
        "## Extracted Text",
        "",
        document.text,
    ]
    path.write_text("\n".join(sections), encoding="utf-8")
    return path
52
+
53
+
54
def ingest_source(
    url: str | None,
    pdf_path: str | None,
    chunk_size: int | None = None,
    chunk_overlap: int | None = None,
    collection_name: str | None = None,
) -> IngestionResult:
    """Run the full pipeline: extract, chunk, embed, upload and export.

    Args:
        url: A YouTube/arXiv link or arXiv id, if any.
        pdf_path: Path to a PDF file, if any.
        chunk_size: Maximum chunk length; ``None`` or ``0`` uses
            ``settings.CHUNK_SIZE``.
        chunk_overlap: Characters shared by consecutive chunks; only ``None``
            falls back to ``settings.CHUNK_OVERLAP``, so an explicit ``0``
            really disables overlap.
        collection_name: Target collection; the store applies its own default
            when this is empty.

    Returns:
        An ``IngestionResult`` describing the document, its chunks, the
        collection written to and the export file.
    """
    document = extract_document(url=url, pdf_path=pdf_path)

    # A size of 0 is never valid, so treating it like "not given" is safe.
    # Overlap is different: 0 is a legitimate choice (it is the UI slider's
    # minimum) and must not be replaced by the default.
    # int() guards against whole-number floats arriving from the UI.
    effective_size = int(chunk_size or settings.CHUNK_SIZE)
    effective_overlap = settings.CHUNK_OVERLAP if chunk_overlap is None else int(chunk_overlap)

    chunks = chunk_document(
        document,
        chunk_size=effective_size,
        overlap=effective_overlap,
    )
    embeddings = get_embedding_client().embed_texts([chunk.text for chunk in chunks])
    store = QdrantVectorStore(collection_name=collection_name)
    store.upsert_chunks(chunks, embeddings)
    export_path = save_markdown(document, len(chunks))
    return IngestionResult(
        document=document,
        chunks=chunks,
        collection_name=store.collection_name,
        export_path=export_path,
    )
77
+
78
+
79
+ def search_knowledge_base(query: str, limit: int = 5, collection_name: str | None = None):
80
+ query_text = query.strip()
81
+ if not query_text:
82
+ raise ValueError("Enter a query to search.")
83
+ embedding = get_embedding_client().embed_texts([query_text])[0]
84
+ return QdrantVectorStore(collection_name=collection_name).search(embedding, limit=limit)
85
+
86
+
87
def answer_from_knowledge_base(query: str, limit: int = 5, collection_name: str | None = None):
    """Retrieve context for ``query`` and have the NVIDIA chat model answer from it.

    The search runs first, so an invalid query fails before a chat client is
    created. Returns whatever ``NvidiaChatClient.answer_with_context`` returns.
    """
    retrieved = search_knowledge_base(query, limit=limit, collection_name=collection_name)
    chat_client = NvidiaChatClient()
    return chat_client.answer_with_context(query, retrieved)
app/services/vector_store.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from qdrant_client import QdrantClient
2
+ from qdrant_client.http.models import Distance, PointStruct, VectorParams
3
+
4
+ from app.core.config import settings
5
+ from app.core.models import Chunk, SearchResult
6
+
7
+
8
class QdrantVectorStore:
    """Stores chunk embeddings in, and searches, one Qdrant collection."""

    def __init__(self, collection_name: str | None = None):
        """Connect to Qdrant using the application settings.

        Args:
            collection_name: Collection to use; an empty value selects
                ``settings.QDRANT_COLLECTION_NAME``.
        """
        self.collection_name = collection_name or settings.QDRANT_COLLECTION_NAME
        self.client = QdrantClient(
            url=settings.get_qdrant_url(),
            api_key=settings.QDRANT_API_KEY or None,  # empty string means "no key"
            timeout=60,
        )

    def ensure_collection(self, vector_size: int) -> None:
        """Create the collection with cosine distance unless it already exists."""
        existing_names = {collection.name for collection in self.client.get_collections().collections}
        if self.collection_name in existing_names:
            return
        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
        )

    def upsert_chunks(self, chunks: list[Chunk], embeddings: list[list[float]]) -> None:
        """Write every chunk with its embedding to the collection.

        Does nothing for an empty input. The collection is created on demand,
        sized after the first embedding.

        Raises:
            ValueError: If ``chunks`` and ``embeddings`` differ in length.
        """
        if len(chunks) != len(embeddings):
            raise ValueError("Chunks and embeddings must have the same length.")
        if not chunks:
            return

        self.ensure_collection(vector_size=len(embeddings[0]))

        points = []
        for chunk, embedding in zip(chunks, embeddings, strict=True):
            payload = {
                "text": chunk.text,
                "chunk_index": chunk.index,
                "source_type": chunk.source_type.value,
                "source": chunk.source,
                "title": chunk.title,
                "metadata": chunk.metadata,
            }
            points.append(PointStruct(id=chunk.id, vector=embedding, payload=payload))

        self.client.upsert(collection_name=self.collection_name, points=points)

    def search(self, query_embedding: list[float], limit: int = 5) -> list[SearchResult]:
        """Return up to ``limit`` stored chunks closest to ``query_embedding``.

        Missing payload fields become empty strings (or an empty dict for
        ``metadata``).
        """
        # Use ``query_points`` when the installed client provides it,
        # otherwise fall back to ``search``.
        if not hasattr(self.client, "query_points"):
            hits = self.client.search(
                collection_name=self.collection_name,
                query_vector=query_embedding,
                limit=limit,
                with_payload=True,
            )
        else:
            hits = self.client.query_points(
                collection_name=self.collection_name,
                query=query_embedding,
                limit=limit,
                with_payload=True,
            ).points

        def _to_result(hit) -> SearchResult:
            payload = hit.payload or {}
            return SearchResult(
                score=float(hit.score),
                text=str(payload.get("text", "")),
                title=str(payload.get("title", "")),
                source=str(payload.get("source", "")),
                source_type=str(payload.get("source_type", "")),
                metadata=dict(payload.get("metadata", {})),
            )

        return [_to_result(hit) for hit in hits]
app/ui/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """UI package."""
app/ui/gradio_app.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import traceback
3
+
4
+ import gradio as gr
5
+
6
+ from app.core.config import settings
7
+ from app.ui.theme import CSS, HEAD, JS
8
+ from app.utils.zerogpu import gpu
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ THEME = gr.themes.Base(
13
+ primary_hue="cyan",
14
+ secondary_hue="lime",
15
+ neutral_hue="slate",
16
+ radius_size="sm",
17
+ font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
18
+ font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "ui-monospace", "monospace"],
19
+ )
20
+
21
+
22
def _format_metadata(metadata: dict) -> str:
    """Render metadata as Markdown, one bold-keyed paragraph per entry.

    An empty (or otherwise falsy) mapping yields a fixed placeholder sentence.
    """
    if not metadata:
        return "No metadata found."
    return "\n\n".join(f"**{key}**: {value}" for key, value in metadata.items())
29
+
30
+
31
@gpu()
def _ingest(url: str, pdf_file: str | None, chunk_size: int, chunk_overlap: int, collection_name: str):
    """Gradio callback: ingest one source and report the outcome.

    Args:
        url: YouTube/arXiv link or arXiv id from the text box.
        pdf_file: Path of the uploaded PDF, or ``None``.
        chunk_size: Chunk size chosen in the UI.
        chunk_overlap: Chunk overlap chosen in the UI.
        collection_name: Target Qdrant collection.

    Returns:
        An 8-tuple of strings: status Markdown, title, source type, character
        count, chunk count, metadata Markdown, text preview, export path.
        On failure the status holds the error and the remaining fields are
        empty (counts are ``"0"``). The callback never raises.
    """
    logger.info(
        "Ingest requested url=%s pdf_file=%s chunk_size=%s chunk_overlap=%s collection=%s",
        url,
        pdf_file,
        chunk_size,
        chunk_overlap,
        collection_name,
    )
    try:
        # Imported on demand so the heavy service modules load only when used.
        from app.services.ingestion import ingest_source

        result = ingest_source(
            url=url,
            pdf_path=pdf_file,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            collection_name=collection_name,
        )
        document = result.document
        status = (
            f"### Ingestion complete\n\n"
            f"Uploaded **{len(result.chunks)} chunks** into Qdrant collection "
            f"`{result.collection_name}`.\n\n"
            f"Saved extracted text to `{result.export_path}`."
        )
        # Cap the preview so very long documents do not flood the UI.
        preview = document.text[:12000]
        if len(document.text) > len(preview):
            preview += "\n\n[Preview truncated in UI. Full text is saved in the export file.]"
        return (
            status,
            document.title,
            document.source_type.value,
            str(len(document.text)),
            str(len(result.chunks)),
            _format_metadata(document.metadata),
            preview,
            str(result.export_path),
        )
    except Exception as exc:
        # UI boundary: show the error to the user, but also record the full
        # traceback server-side so failures are not lost.
        logger.exception("Ingestion failed")
        return (
            f"### Ingestion failed\n\n`{type(exc).__name__}: {exc}`\n\n```text\n{traceback.format_exc(limit=2)}\n```",
            "",
            "",
            "0",
            "0",
            "",
            "",
            "",
        )
82
+
83
+
84
@gpu()
def _search(query: str, limit: int, collection_name: str):
    """Gradio callback: search the knowledge base and render the hits as Markdown.

    Args:
        query: Search text.
        limit: Maximum number of results.
        collection_name: Qdrant collection to search.

    Returns:
        Markdown with one section per hit (excerpts capped at 1200
        characters), ``"No matches found."`` for an empty result, or an
        error message. The callback never raises.
    """
    logger.info("Search requested query=%s limit=%s collection=%s", query, limit, collection_name)
    try:
        # Imported on demand so the heavy service modules load only when used.
        from app.services.ingestion import search_knowledge_base

        results = search_knowledge_base(query, limit=limit, collection_name=collection_name)
    except Exception as exc:
        # UI boundary: report to the user, and keep the traceback in the log.
        logger.exception("Search failed")
        if "MPS backend out of memory" in str(exc):
            return (
                "### Search failed\n\n"
                "The local embedding model ran out of Apple GPU memory. "
                "Restart the app so the new CPU embedding setting takes effect. "
                "Keep `EMBEDDING_DEVICE=cpu` in `.env`."
            )
        return f"### Search failed\n\n`{type(exc).__name__}: {exc}`"

    if not results:
        return "No matches found."

    blocks = [
        "\n".join(
            [
                f"### {index}. {result.title}",
                f"**Score:** {result.score:.4f}",
                f"**Source:** {result.source_type} | {result.source}",
                "",
                result.text[:1200],
            ]
        )
        for index, result in enumerate(results, start=1)
    ]
    return "\n\n---\n\n".join(blocks)
119
+
120
+
121
@gpu()
def _answer(query: str, limit: int, collection_name: str):
    """Gradio callback: answer a question from the knowledge base.

    Args:
        query: The user's question.
        limit: Maximum number of context passages to retrieve.
        collection_name: Qdrant collection to search.

    Returns:
        A 3-tuple of strings: the answer, the model's reasoning (or a
        placeholder sentence when none was returned), and the retrieved
        context as Markdown (passages capped at 1000 characters). On failure
        the first element holds the error and the other two are empty. The
        callback never raises.
    """
    logger.info("Answer requested query=%s limit=%s collection=%s", query, limit, collection_name)
    try:
        # Imported on demand so the heavy service modules load only when used.
        from app.services.ingestion import answer_from_knowledge_base

        result = answer_from_knowledge_base(query, limit=limit, collection_name=collection_name)
    except Exception as exc:
        # UI boundary: report to the user, and keep the traceback in the log.
        logger.exception("Answer failed")
        if "MPS backend out of memory" in str(exc):
            return (
                "### Answer failed\n\n"
                "The local embedding model ran out of Apple GPU memory. "
                "Restart the app so the new CPU embedding setting takes effect. "
                "Keep `EMBEDDING_DEVICE=cpu` in `.env`.",
                "",
                "",
            )
        return f"### Answer failed\n\n`{type(exc).__name__}: {exc}`", "", ""

    context_blocks = [
        "\n".join(
            [
                f"### [{index}] {item.title}",
                f"**Score:** {item.score:.4f}",
                f"**Source:** {item.source_type} | {item.source}",
                "",
                item.text[:1000],
            ]
        )
        for index, item in enumerate(result.context, start=1)
    ]

    reasoning = result.reasoning or "No reasoning content was returned by the API."
    return result.answer, reasoning, "\n\n---\n\n".join(context_blocks)
156
+
157
+
158
def build_app() -> gr.Blocks:
    """Assemble the Gradio UI and return it without launching it.

    The layout has a title/chip header followed by two tabs:

    * "Ingest" - a URL box, a PDF upload, chunking sliders and a collection
      name, wired to ``_ingest``; results land in the status/stat widgets.
    * "Retrieve" - a query box, a result-count slider and a collection name,
      wired to ``_search`` and ``_answer``.

    The collection-name textbox on each tab mirrors edits made on the other.

    Returns:
        The constructed ``gr.Blocks`` instance.
    """
    default_collection = settings.QDRANT_COLLECTION_NAME

    # Static text is prepared up front so the layout code below stays readable.
    intro_markdown = f"""
        # {settings.PROJECT_NAME}
        Turn papers, PDFs, and videos into a searchable vector memory.

        Extract text, chunk it cleanly, embed locally, and use NVIDIA chat for grounded answers.
        """
    chips_html = f"""
        <div class="kh-chip-row">
        <div class="kh-chip">Embeddings <code>{settings.NEMOTRON_EMBED_MODEL}</code></div>
        <div class="kh-chip">Parser <code>{settings.NEMOTRON_PARSE_MODEL}</code></div>
        <div class="kh-chip">Chat <code>{settings.NVIDIA_CHAT_MODEL}</code></div>
        <div class="kh-chip">Collection <code>{default_collection}</code></div>
        <div class="kh-chip">Sources PDF · arXiv · YouTube</div>
        </div>
        """
    intake_blurb = "### Source Intake\n<div class='kh-subhead'>Upload a PDF or paste one link. The pipeline handles extraction, chunking, local embeddings, and Qdrant upload.</div>"
    probe_blurb = "### Retrieval Probe\n<div class='kh-subhead'>Run a quick similarity search against the same Qdrant collection after ingestion.</div>"

    def read_only_stat(label):
        # Small read-only textbox used for the four pipeline statistics.
        return gr.Textbox(label=label, interactive=False, elem_classes=["kh-stat"])

    with gr.Blocks(title=f"{settings.PROJECT_NAME} Ingestor") as app:
        with gr.Column(elem_id="kh-shell"):
            gr.Markdown(intro_markdown, elem_id="kh-title")
            gr.HTML(chips_html)

            with gr.Tabs():
                with gr.Tab("Ingest"):
                    with gr.Row(equal_height=True):
                        with gr.Column(scale=5, elem_classes=["kh-panel"]):
                            gr.Markdown(intake_blurb)
                            source_url = gr.Textbox(
                                label="YouTube or arXiv input",
                                placeholder="Paste a YouTube URL, arXiv URL, or arXiv ID",
                                lines=2,
                            )
                            pdf_file = gr.File(
                                label="PDF document", file_types=[".pdf"], type="filepath"
                            )
                            with gr.Row():
                                chunk_size = gr.Slider(
                                    400, 2500, value=settings.CHUNK_SIZE, step=50, label="Chunk size"
                                )
                                chunk_overlap = gr.Slider(
                                    0,
                                    600,
                                    value=settings.CHUNK_OVERLAP,
                                    step=25,
                                    label="Chunk overlap",
                                )
                            collection_name_ingest = gr.Textbox(
                                label="Collection Name",
                                value=default_collection,
                                placeholder="Enter Qdrant collection name",
                            )
                            ingest_btn = gr.Button("Ingest into Qdrant", variant="primary")

                        with gr.Column(scale=4, elem_classes=["kh-panel"]):
                            gr.Markdown("### Pipeline Status")
                            status = gr.Markdown(elem_id="kh-status")
                            with gr.Row():
                                title = read_only_stat("Title")
                                source_type = read_only_stat("Type")
                            with gr.Row():
                                char_count = read_only_stat("Characters")
                                chunk_count = read_only_stat("Chunks")
                            export_path = gr.Textbox(label="Export file", interactive=False)

                    with gr.Row(equal_height=True):
                        metadata = gr.Markdown(label="Metadata", elem_classes=["kh-panel"])
                        text_preview = gr.Textbox(
                            label="Extracted text preview",
                            lines=18,
                            interactive=False,
                            elem_id="kh-text-preview",
                            elem_classes=["kh-panel"],
                        )

                    # Output order must match the tuple returned by _ingest.
                    ingest_inputs = [
                        source_url,
                        pdf_file,
                        chunk_size,
                        chunk_overlap,
                        collection_name_ingest,
                    ]
                    ingest_outputs = [
                        status,
                        title,
                        source_type,
                        char_count,
                        chunk_count,
                        metadata,
                        text_preview,
                        export_path,
                    ]
                    ingest_btn.click(fn=_ingest, inputs=ingest_inputs, outputs=ingest_outputs)

                with gr.Tab("Retrieve"):
                    with gr.Row(equal_height=True):
                        with gr.Column(scale=3, elem_classes=["kh-panel"]):
                            gr.Markdown(probe_blurb)
                            query = gr.Textbox(
                                label="Search query",
                                placeholder="Ask a question or enter keywords",
                                lines=4,
                            )
                            limit = gr.Slider(1, 10, value=5, step=1, label="Results")
                            collection_name_retrieve = gr.Textbox(
                                label="Collection Name",
                                value=default_collection,
                                placeholder="Enter Qdrant collection name",
                            )
                            with gr.Row():
                                search_btn = gr.Button("Search Qdrant", variant="secondary")
                                answer_btn = gr.Button("Answer with NVIDIA", variant="primary")
                        with gr.Column(scale=5, elem_classes=["kh-panel"]):
                            gr.Markdown("### Answer")
                            answer_output = gr.Markdown(elem_id="kh-answer")

                    with gr.Row(equal_height=True):
                        with gr.Column(elem_classes=["kh-panel"]):
                            gr.Markdown("### Matches")
                            search_results = gr.Markdown(elem_id="kh-search-results")
                        with gr.Column(elem_classes=["kh-panel"]):
                            gr.Markdown("### Reasoning")
                            reasoning_output = gr.Markdown(elem_id="kh-reasoning")

                    # Both buttons read the same three controls.
                    retrieve_inputs = [query, limit, collection_name_retrieve]
                    search_btn.click(fn=_search, inputs=retrieve_inputs, outputs=search_results)
                    answer_btn.click(
                        fn=_answer,
                        inputs=retrieve_inputs,
                        outputs=[answer_output, reasoning_output, search_results],
                    )

                # Sync collection names across tabs
                collection_name_ingest.change(
                    fn=lambda x: x,
                    inputs=[collection_name_ingest],
                    outputs=[collection_name_retrieve],
                )
                collection_name_retrieve.change(
                    fn=lambda x: x,
                    inputs=[collection_name_retrieve],
                    outputs=[collection_name_ingest],
                )

    return app
327
+
328
+
329
def serve(host: str = "0.0.0.0", port: int = 7860) -> None:
    """Build the Gradio app and start serving it (blocking).

    Args:
        host: Interface to bind. The default ``"0.0.0.0"`` listens on every
            interface; pass ``"127.0.0.1"`` for local-only access.
        port: TCP port to listen on.

    The defaults reproduce the previous hard-coded behaviour, so existing
    ``serve()`` callers are unaffected.
    """
    logger.info("Building Gradio app")
    demo = build_app()
    # Lazy %-formatting; with the defaults this logs "... on 0.0.0.0:7860".
    logger.info("Launching Gradio server on %s:%s", host, port)
    demo.queue().launch(
        server_name=host,
        server_port=port,
        show_error=True,
        theme=THEME,
        css=CSS,
        js=JS,
        head=HEAD,
    )
app/ui/theme.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Extra <head> markup: preconnect hints plus the Google Fonts stylesheet that
# provides the Inter and JetBrains Mono families referenced by CSS.
HEAD = """
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&family=JetBrains+Mono:wght@400;600&display=swap" rel="stylesheet">
"""
6
+
7
# Page script: flags the Gradio root element as ready and prepends the
# decorative ".kh-scanline" overlay element that CSS styles.
JS = """
() => {
    const root = document.querySelector('.gradio-container');
    if (!root) return;
    root.dataset.ready = 'true';
    const marker = document.createElement('div');
    marker.className = 'kh-scanline';
    root.prepend(marker);
}
"""
17
+
18
# Stylesheet for the app. Selectors prefixed "kh-" correspond to the elem_id /
# elem_classes values and HTML class names used when the UI is built; the
# remaining selectors restyle stock Gradio elements.
CSS = """
:root {
    --kh-bg: #080b0f;
    --kh-surface: rgba(18, 24, 32, 0.78);
    --kh-surface-strong: rgba(27, 36, 48, 0.92);
    --kh-ink: #f7fbff;
    --kh-muted: #a7b4c2;
    --kh-soft: #d8e1ea;
    --kh-line: rgba(255, 255, 255, 0.12);
    --kh-cyan: #20d6c7;
    --kh-lime: #b8f45d;
    --kh-rose: #ff6b8a;
    --kh-amber: #ffcf5c;
    --kh-shadow: rgba(0, 0, 0, 0.32);
}

.gradio-container {
    min-height: 100vh;
    background:
        radial-gradient(circle at 18% 8%, rgba(32, 214, 199, 0.22), transparent 30%),
        radial-gradient(circle at 86% 12%, rgba(255, 207, 92, 0.16), transparent 28%),
        linear-gradient(135deg, #080b0f 0%, #101720 48%, #0b1017 100%) !important;
    color: var(--kh-ink);
    font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
}

.kh-scanline {
    position: fixed;
    inset: 0;
    pointer-events: none;
    background-image: linear-gradient(rgba(255,255,255,0.035) 1px, transparent 1px);
    background-size: 100% 4px;
    mask-image: linear-gradient(to bottom, transparent, black 18%, black 72%, transparent);
    opacity: 0.18;
    z-index: 0;
}

#kh-shell {
    position: relative;
    z-index: 1;
    max-width: 1220px;
    margin: 0 auto;
    padding: 28px 18px 42px;
}

#kh-title {
    padding: 34px 0 22px;
    border-bottom: 1px solid var(--kh-line);
}

#kh-title h1 {
    max-width: 920px;
    color: var(--kh-ink);
    font-size: clamp(2.6rem, 6vw, 6rem);
    font-weight: 800;
    line-height: 0.9;
    margin: 0 0 14px;
    letter-spacing: 0;
}

#kh-title p {
    max-width: 780px;
    color: var(--kh-muted);
    font-size: 1.04rem;
    line-height: 1.65;
}

#kh-title code,
.kh-chip code {
    color: var(--kh-lime);
    background: rgba(184, 244, 93, 0.09);
    border: 1px solid rgba(184, 244, 93, 0.18);
    border-radius: 6px;
    padding: 2px 6px;
    font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, monospace;
}

.kh-panel {
    border: 1px solid var(--kh-line);
    border-radius: 8px;
    background: linear-gradient(180deg, var(--kh-surface-strong), var(--kh-surface));
    box-shadow: 0 24px 70px var(--kh-shadow);
    backdrop-filter: blur(18px);
    padding: 18px !important;
}

.kh-panel label,
.kh-panel .label-wrap span {
    color: var(--kh-soft) !important;
    font-weight: 700 !important;
}

.kh-subhead {
    margin: 8px 0 16px;
    color: var(--kh-muted);
    font-size: 0.95rem;
}

.kh-chip-row {
    display: flex;
    flex-wrap: wrap;
    gap: 10px;
    margin-top: 18px;
}

.kh-chip {
    border: 1px solid var(--kh-line);
    border-radius: 999px;
    padding: 8px 12px;
    color: var(--kh-soft);
    background: rgba(255, 255, 255, 0.055);
    font-size: 0.9rem;
}

.kh-stat {
    min-height: 92px;
    border: 1px solid var(--kh-line);
    border-radius: 8px;
    padding: 14px 16px;
    background: rgba(255, 255, 255, 0.055);
}

.kh-stat .wrap,
.kh-stat input {
    background: transparent !important;
}

.tabs {
    margin-top: 20px;
}

.tab-nav button {
    color: var(--kh-muted) !important;
    border-radius: 8px !important;
    font-weight: 700 !important;
}

.tab-nav button.selected {
    color: var(--kh-ink) !important;
    background: linear-gradient(135deg, rgba(32, 214, 199, 0.22), rgba(184, 244, 93, 0.12)) !important;
    border: 1px solid rgba(32, 214, 199, 0.34) !important;
}

textarea,
input {
    color: var(--kh-ink) !important;
    background: rgba(3, 7, 12, 0.52) !important;
    border-color: rgba(255, 255, 255, 0.12) !important;
    font-size: 0.96rem !important;
}

textarea::placeholder,
input::placeholder {
    color: rgba(216, 225, 234, 0.46) !important;
}

#kh-status {
    min-height: 130px;
}

#kh-status h3 {
    color: var(--kh-lime);
    margin-top: 0;
}

#kh-text-preview textarea {
    min-height: 430px !important;
    line-height: 1.6 !important;
    font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, monospace !important;
    font-size: 0.9rem !important;
}

#kh-search-results {
    min-height: 410px;
}

#kh-answer,
#kh-reasoning {
    min-height: 240px;
}

#kh-answer {
    font-size: 1.02rem;
    line-height: 1.7;
}

#kh-search-results h3 {
    color: var(--kh-cyan);
}

#kh-reasoning {
    color: var(--kh-muted);
    font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, monospace;
    font-size: 0.86rem;
    line-height: 1.6;
}

.prose,
.markdown {
    color: var(--kh-soft) !important;
}

.prose strong,
.markdown strong {
    color: var(--kh-ink) !important;
}

button.primary {
    min-height: 46px;
    background: linear-gradient(135deg, var(--kh-cyan), var(--kh-lime)) !important;
    color: #061015 !important;
    border: 0 !important;
    border-radius: 8px !important;
    font-weight: 800 !important;
    box-shadow: 0 16px 34px rgba(32, 214, 199, 0.2);
}

button.secondary {
    border-radius: 8px !important;
}

.file-preview,
.upload-container {
    border-color: rgba(32, 214, 199, 0.26) !important;
    background: rgba(32, 214, 199, 0.055) !important;
}

@media (max-width: 760px) {
    #kh-shell {
        padding: 18px 10px 32px;
    }

    #kh-title h1 {
        font-size: clamp(2.25rem, 15vw, 4.2rem);
    }

    .kh-panel {
        padding: 14px !important;
    }
}
"""
app/utils/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Utility helpers."""
app/utils/source_detection.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from pathlib import Path
3
+ from urllib.parse import urlparse
4
+
5
+ from app.core.models import SourceType
6
+
7
+
8
# New-style arXiv identifier (YYMM.NNNN or YYMM.NNNNN), optionally preceded by
# an arxiv.org/abs/ or arxiv.org/pdf/ path and followed by a version suffix.
# The lookbehind/lookahead keep the pattern from matching inside a longer run
# of digits (e.g. "12345.67890" or "2401.123456"), which are not arXiv IDs.
# The named group "id" excludes the version suffix.
ARXIV_RE = re.compile(
    r"(?:arxiv\.org/(?:abs|pdf)/)?(?<!\d)(?P<id>\d{4}\.\d{4,5})(?:v\d+)?(?!\d)",
    re.I,
)
# Hostnames accepted as YouTube links.
YOUTUBE_HOSTS = {"youtube.com", "www.youtube.com", "m.youtube.com", "youtu.be", "www.youtu.be"}
10
+
11
+
12
def detect_source(url: str | None, pdf_path: str | None) -> SourceType:
    """Classify an ingestion request as PDF, YouTube, or arXiv.

    An uploaded file takes precedence over ``url``.

    Args:
        url: A YouTube URL, an arXiv URL, or a bare arXiv ID. The scheme is
            optional (``youtube.com/watch?v=...`` is accepted).
        pdf_path: Filesystem path of an uploaded file, if any.

    Returns:
        The detected ``SourceType``.

    Raises:
        ValueError: If the uploaded file is not a ``.pdf``, if neither input
            is provided, or if the URL matches no supported source.
    """
    if pdf_path:
        if Path(pdf_path).suffix.lower() == ".pdf":
            return SourceType.PDF
        raise ValueError("Uploaded file must be a PDF.")

    clean_url = (url or "").strip()
    if not clean_url:
        raise ValueError("Provide a YouTube link, arXiv link/ID, or upload a PDF.")

    parsed = urlparse(clean_url)
    if not parsed.netloc:
        # Without a scheme urlparse puts everything in ``path``; a leading
        # "//" makes it treat the first segment as the network location.
        parsed = urlparse(f"//{clean_url}")
    # ``hostname`` drops any port/userinfo and is already lower-cased.
    host = parsed.hostname or ""

    if host in YOUTUBE_HOSTS:
        return SourceType.YOUTUBE
    # Exact domain or subdomain match rather than a substring test.
    is_arxiv_host = host == "arxiv.org" or host.endswith(".arxiv.org")
    if is_arxiv_host or ARXIV_RE.search(clean_url):
        return SourceType.ARXIV
    raise ValueError("Could not detect source type. Use a YouTube URL, arXiv URL/ID, or PDF.")
31
+
32
+
33
def extract_arxiv_id(value: str) -> str:
    """Return the arXiv identifier found in ``value``.

    ``value`` is stripped and searched with ``ARXIV_RE``; the result is the
    pattern's ``id`` group.

    Raises:
        ValueError: If no identifier is found.
    """
    found = ARXIV_RE.search(value.strip())
    if found is None:
        raise ValueError("Could not find a valid arXiv ID.")
    return found["id"]
app/utils/zerogpu.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from collections.abc import Callable
3
+ from typing import TypeVar
4
+
5
+ from app.core.config import settings
6
+
7
+
8
# Type variable for "any callable", so decorators can return the type they got.
F = TypeVar("F", bound=Callable)


def gpu(duration: int | None = None) -> Callable[[F], F]:
    """Return a decorator that requests a ZeroGPU slot when ZeroGPU is in use.

    Args:
        duration: Value forwarded as ``duration`` to ``spaces.GPU``. A falsy
            value (``None`` or ``0``) falls back to
            ``settings.ZEROGPU_DURATION_SECONDS``.

    Returns:
        ``spaces.GPU(duration=...)`` when ZeroGPU should be used and the
        ``spaces`` package imports; otherwise a pass-through decorator.
    """
    if _should_use_zerogpu():
        try:
            # Imported lazily so the package is only needed when ZeroGPU is on.
            import spaces
        except ImportError:
            pass
        else:
            seconds = duration or settings.ZEROGPU_DURATION_SECONDS
            return spaces.GPU(duration=seconds)
    return _identity
21
+
22
+
23
def _identity(func: F) -> F:
    """Pass-through decorator: hand back ``func`` exactly as received."""
    return func
25
+
26
+
27
def _should_use_zerogpu() -> bool:
    """Decide from environment variables whether ZeroGPU should be used.

    Precedence: ``DISABLE_ZEROGPU`` set to 1/true/yes (case-insensitive)
    returns False; otherwise ``ENABLE_ZEROGPU`` set to 1/true/yes returns
    True; otherwise the result is whether ``SPACE_ID`` is set and non-empty.
    """
    affirmative = {"1", "true", "yes"}

    def flag_on(name: str) -> bool:
        return os.getenv(name, "").lower() in affirmative

    if flag_on("DISABLE_ZEROGPU"):
        return False
    return flag_on("ENABLE_ZEROGPU") or bool(os.getenv("SPACE_ID"))
33
+
34
+
35
def is_enabled() -> bool:
    """Public accessor: report whether ZeroGPU should be used."""
    enabled = _should_use_zerogpu()
    return enabled
pyproject.toml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "knowledgehub-ingestor"
3
+ version = "1.0.0"
4
+ description = "A Gradio document ingestion UI for PDFs, arXiv papers, and YouTube transcripts."
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "arxiv>=2.1.3",
9
+ "datasets>=5.0.0",
10
+ "gradio>=5.0.0",
11
+ "openai>=1.99.0",
12
+ "pydantic-settings>=2.4.0",
13
+ "pypdf>=4.3.1",
14
+ "python-dotenv>=1.0.1",
15
+ "qdrant-client>=1.12.1",
16
+ "requests>=2.32.3",
17
+ "sentence-transformers>=3.0.1",
18
+ "spaces",
19
+ "torchvision>=0.27.0",
20
+ "youtube-transcript-api>=0.6.2",
21
+ ]
22
+
23
+ [project.optional-dependencies]
24
+ dev = [
25
+ "ruff>=0.6.0",
26
+ "pytest>=8.3.2",
27
+ ]
28
+
29
+ [tool.ruff]
30
+ line-length = 100
31
+ target-version = "py310"
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ arxiv>=2.1.3
2
+ gradio>=5.0.0
3
+ openai>=1.99.0
4
+ pydantic-settings>=2.4.0
5
+ pypdf>=4.3.1
6
+ python-dotenv>=1.0.1
7
+ qdrant-client>=1.12.1
8
+ requests>=2.32.3
9
+ sentence-transformers>=3.0.1
10
+ spaces
11
+ youtube-transcript-api>=0.6.2
uv.lock ADDED
The diff for this file is too large to render. See raw diff