Spaces:
Running on Zero
Running on Zero
Commit ·
b5e0c74
0
Parent(s):
pushing to git
Browse files- .env.example +25 -0
- .gitignore +7 -0
- README.md +58 -0
- app.py +28 -0
- app/__init__.py +1 -0
- app/core/__init__.py +1 -0
- app/core/config.py +45 -0
- app/core/models.py +48 -0
- app/extractors/__init__.py +1 -0
- app/extractors/arxiv.py +46 -0
- app/extractors/pdf.py +30 -0
- app/extractors/youtube.py +47 -0
- app/services/__init__.py +1 -0
- app/services/chat.py +68 -0
- app/services/chunking.py +47 -0
- app/services/embeddings.py +109 -0
- app/services/ingestion.py +89 -0
- app/services/vector_store.py +80 -0
- app/ui/__init__.py +1 -0
- app/ui/gradio_app.py +341 -0
- app/ui/theme.py +258 -0
- app/utils/__init__.py +1 -0
- app/utils/source_detection.py +37 -0
- app/utils/zerogpu.py +36 -0
- pyproject.toml +31 -0
- requirements.txt +11 -0
- uv.lock +0 -0
.env.example
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
PROJECT_NAME=KnowledgeHub
|
| 2 |
+
VERSION=1.0.0
|
| 3 |
+
API_V1_STR=/api/v1
|
| 4 |
+
|
| 5 |
+
QDRANT_URL=http://localhost:6333
|
| 6 |
+
QDRANT_API_KEY=
|
| 7 |
+
QDRANT_COLLECTION_NAME=knowledge_base
|
| 8 |
+
|
| 9 |
+
NEMOTRON_PARSE_MODEL=Qwen/Qwen2-VL-2B-Instruct
|
| 10 |
+
NEMOTRON_EMBED_MODEL=nvidia/llama-nemotron-colembed-vl-3b-v2
|
| 11 |
+
EMBEDDING_DEVICE=cpu
|
| 12 |
+
HF_TOKEN=
|
| 13 |
+
NVIDIA_API_KEY=
|
| 14 |
+
NVIDIA_CHAT_MODEL=nvidia/nvidia-nemotron-nano-9b-v2
|
| 15 |
+
NVIDIA_API_URL=https://integrate.api.nvidia.com/v1
|
| 16 |
+
|
| 17 |
+
MAX_UPLOAD_SIZE=10485760
|
| 18 |
+
CHUNK_SIZE=1100
|
| 19 |
+
CHUNK_OVERLAP=180
|
| 20 |
+
ZEROGPU_DURATION_SECONDS=180
|
| 21 |
+
CHAT_TEMPERATURE=0.6
|
| 22 |
+
CHAT_TOP_P=0.95
|
| 23 |
+
CHAT_MAX_TOKENS=2048
|
| 24 |
+
MIN_THINKING_TOKENS=1024
|
| 25 |
+
MAX_THINKING_TOKENS=2048
|
.gitignore
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
.venv/
|
| 3 |
+
__pycache__/
|
| 4 |
+
*.py[cod]
|
| 5 |
+
.pytest_cache/
|
| 6 |
+
.ruff_cache/
|
| 7 |
+
data/exports/
|
README.md
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# KnowledgeHub Ingestor
|
| 2 |
+
|
| 3 |
+
KnowledgeHub Ingestor is a modular Gradio app for loading knowledge from:
|
| 4 |
+
|
| 5 |
+
- YouTube links with public transcripts/captions
|
| 6 |
+
- arXiv links or IDs
|
| 7 |
+
- PDF documents
|
| 8 |
+
|
| 9 |
+
It extracts text, chunks it, embeds chunks locally with your embedding model, and uploads vectors into Qdrant for retrieval. The answer generation step uses NVIDIA's OpenAI-compatible chat API.
|
| 10 |
+
|
| 11 |
+
## Setup
|
| 12 |
+
|
| 13 |
+
```bash
|
| 14 |
+
python3 -m venv .venv
|
| 15 |
+
source .venv/bin/activate
|
| 16 |
+
pip install -r requirements.txt
|
| 17 |
+
cp .env.example .env
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
Add `NVIDIA_API_KEY` to `.env` for chat completions. Start Qdrant locally or point `QDRANT_URL` to your hosted instance.
|
| 21 |
+
|
| 22 |
+
The default model split is:
|
| 23 |
+
|
| 24 |
+
- Local parsing model: `Qwen/Qwen2-VL-2B-Instruct`
|
| 25 |
+
- Local embedding model: `nvidia/llama-nemotron-colembed-vl-3b-v2`
|
| 26 |
+
- NVIDIA API chat model: `nvidia/nvidia-nemotron-nano-9b-v2`
|
| 27 |
+
|
| 28 |
+
## Run
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
python app.py
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
Open the local Gradio URL printed in the terminal, usually `http://127.0.0.1:7860`.
|
| 35 |
+
|
| 36 |
+
The app binds to `0.0.0.0:7860`, which is suitable for Hugging Face Spaces and container deployments.
|
| 37 |
+
|
| 38 |
+
For Hugging Face ZeroGPU Spaces, set:
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
ENABLE_ZEROGPU=true
|
| 42 |
+
EMBEDDING_DEVICE=cuda
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
The Gradio ingest/search/answer callbacks are decorated with `spaces.GPU` when running on Spaces. Locally, the decorator becomes a no-op.
|
| 46 |
+
|
| 47 |
+
## Project Structure
|
| 48 |
+
|
| 49 |
+
```text
|
| 50 |
+
app/
|
| 51 |
+
core/ settings and shared models
|
| 52 |
+
extractors/ PDF, arXiv, and YouTube extraction
|
| 53 |
+
services/ chunking, embeddings, Qdrant, retrieval, ingestion orchestration
|
| 54 |
+
ui/ Gradio Blocks UI
|
| 55 |
+
utils/ source detection and ZeroGPU helpers
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
YouTube extraction requires captions/transcripts to be available for the video. arXiv ingestion downloads the paper PDF and parses it with `pypdf`.
|
app.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import shutil
|
| 3 |
+
import sys
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Configure root logging once, at import time, with timestamped INFO-level output.
_LOG_FORMAT = "%(asctime)s %(levelname)s %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by the startup code below.
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _clear_app_bytecode() -> None:
    """Disable bytecode writing and delete cached bytecode under ``app/``.

    Sets ``sys.dont_write_bytecode`` and then removes every ``__pycache__``
    directory found beneath the ``app`` folder located next to this file.
    Errors raised while removing a directory are ignored.
    """
    sys.dont_write_bytecode = True
    package_root = Path(__file__).resolve().parent / "app"
    for stale_cache in package_root.rglob("__pycache__"):
        shutil.rmtree(stale_cache, ignore_errors=True)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
if __name__ == "__main__":
    # Cached bytecode is removed before the UI package is imported below.
    logger.info("Preparing BuildSmall app")
    _clear_app_bytecode()

    # The UI module is imported here, not at file top, so it loads only after
    # the cleanup step has run.
    logger.info("Loading Gradio UI")
    import app.ui.gradio_app as gradio_ui

    logger.info("Launching BuildSmall app")
    gradio_ui.serve()
    logger.info("BuildSmall app stopped")
|
app/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""KnowledgeHub document ingestion package."""
|
app/core/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Core application settings and data models."""
|
app/core/config.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Settings(BaseSettings):
    """Application configuration read from the environment and a ``.env`` file.

    The ``.env`` file is looked up two directories above this module (see
    ``model_config``); variables that do not match a field are ignored.
    """

    # Project identity.
    PROJECT_NAME: str = "KnowledgeHub"
    VERSION: str = "1.0.0"
    API_V1_STR: str = "/api/v1"

    # Qdrant connection.
    QDRANT_URL: str = "http://localhost:6333"
    QDRANT_API_KEY: str = ""
    QDRANT_COLLECTION_NAME: str = "knowledge_base"

    # Model identifiers, embedding device, and API credentials.
    NEMOTRON_PARSE_MODEL: str = "Qwen/Qwen2-VL-2B-Instruct"
    NEMOTRON_EMBED_MODEL: str = "nvidia/llama-nemotron-colembed-vl-3b-v2"
    EMBEDDING_DEVICE: str = "cpu"
    HF_TOKEN: str = ""
    NVIDIA_API_KEY: str = ""
    NVIDIA_CHAT_MODEL: str = "nvidia/nvidia-nemotron-nano-9b-v2"
    NVIDIA_API_URL: str = "https://integrate.api.nvidia.com/v1"

    # Ingestion and chat tuning knobs.
    MAX_UPLOAD_SIZE: int = 10 * 1024 * 1024
    CHUNK_SIZE: int = 1100
    CHUNK_OVERLAP: int = 180
    ZEROGPU_DURATION_SECONDS: int = 180
    CHAT_TEMPERATURE: float = 0.6
    CHAT_TOP_P: float = 0.95
    CHAT_MAX_TOKENS: int = 2048
    MIN_THINKING_TOKENS: int = 1024
    MAX_THINKING_TOKENS: int = 2048

    model_config = SettingsConfigDict(
        env_file=Path(__file__).resolve().parents[2] / ".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    def get_qdrant_url(self) -> str:
        """Return ``QDRANT_URL`` with an explicit ``:443`` for port-less HTTPS URLs.

        The port is inserted into the host part of the URL, so a trailing slash
        or path (``https://host/``) yields ``https://host:443/`` rather than the
        malformed ``https://host/:443``. URLs that are not HTTPS, that already
        carry a port, or whose port cannot be parsed are returned unchanged.
        """
        # Local import keeps this method self-contained.
        from urllib.parse import urlsplit, urlunsplit

        parts = urlsplit(self.QDRANT_URL)
        try:
            has_port = parts.port is not None
        except ValueError:
            # Unparseable port: leave the configured value untouched.
            return self.QDRANT_URL

        if parts.scheme != "https" or not parts.netloc or has_port:
            return self.QDRANT_URL
        return urlunsplit(parts._replace(netloc=f"{parts.netloc}:443"))


# Shared settings instance, created once at import time.
settings = Settings()
|
app/core/models.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass, field
|
| 2 |
+
from enum import Enum
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class SourceType(str, Enum):
    """Kinds of sources a document can be extracted from.

    Members are also ``str`` instances, so each one compares equal to its raw
    string value.
    """

    PDF = "pdf"
    ARXIV = "arxiv"
    YOUTUBE = "youtube"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass(frozen=True)
class Document:
    """Immutable record holding the text extracted from one source."""

    source_type: SourceType  # kind of source the text came from
    title: str
    text: str  # full extracted text
    source: str  # origin identifier supplied by the extractor
    metadata: dict[str, Any] = field(default_factory=dict)  # extractor-specific extras
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass(frozen=True)
class Chunk:
    """Immutable slice of a document's text together with its provenance."""

    id: str  # unique identifier of the chunk
    text: str
    index: int  # position of the chunk within its document
    source_type: SourceType
    source: str
    title: str
    metadata: dict[str, Any] = field(default_factory=dict)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@dataclass(frozen=True)
class IngestionResult:
    """Immutable summary of one ingestion run."""

    document: Document  # the extracted document
    chunks: list[Chunk]  # chunks produced from the document
    collection_name: str  # collection the chunks were stored in
    export_path: Path  # location of the exported file
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
@dataclass(frozen=True)
class SearchResult:
    """Immutable record describing one hit returned by a search."""

    score: float  # similarity score of the hit
    text: str
    title: str
    source: str
    source_type: str  # plain string here, not a ``SourceType`` member
    metadata: dict[str, Any]
|
app/extractors/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Source-specific document extractors."""
|
app/extractors/arxiv.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tempfile
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
import arxiv
|
| 5 |
+
import requests
|
| 6 |
+
|
| 7 |
+
from app.core.models import Document, SourceType
|
| 8 |
+
from app.extractors.pdf import extract_pdf
|
| 9 |
+
from app.utils.source_detection import extract_arxiv_id
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def extract_arxiv(value: str) -> Document:
    """Download an arXiv paper's PDF and return its text as a ``Document``.

    Args:
        value: arXiv link or identifier; it is normalized by ``extract_arxiv_id``.

    Returns:
        A ``Document`` of type ``SourceType.ARXIV`` whose ``source`` is the
        paper's ``entry_id`` and whose metadata combines what ``extract_pdf``
        reports with the arXiv details collected here.

    Raises:
        ValueError: If the arXiv search yields no paper for the identifier.
        requests.HTTPError: If the PDF download returns an error status.
    """
    paper_id = extract_arxiv_id(value)
    lookup = arxiv.Search(id_list=[paper_id])
    paper = next(arxiv.Client().results(lookup), None)
    if paper is None:
        raise ValueError(f"No arXiv paper found for {paper_id}.")

    pdf_url = paper.pdf_url or f"https://arxiv.org/pdf/{paper_id}.pdf"

    # Old-style identifiers such as "hep-th/9901001" contain a path separator.
    # Used verbatim in the file name they would point into a subdirectory that
    # does not exist and make write_bytes fail, so separators are flattened.
    # The file name is never surfaced: the title is passed explicitly below.
    safe_name = paper_id.replace("/", "_").replace("\\", "_")

    with tempfile.TemporaryDirectory(prefix="knowledgehub_arxiv_") as tmpdir:
        pdf_path = Path(tmpdir) / f"{safe_name}.pdf"
        response = requests.get(pdf_url, timeout=60)
        response.raise_for_status()
        pdf_path.write_bytes(response.content)

        # Parse while the temporary directory (and the PDF in it) still exists.
        document = extract_pdf(
            pdf_path,
            title=paper.title,
            metadata={
                "arxiv_id": paper_id,
                "authors": [str(author) for author in paper.authors],
                "published": paper.published.isoformat() if paper.published else None,
                "summary": paper.summary,
                "pdf_url": pdf_url,
                "entry_id": paper.entry_id,
            },
        )

    # Re-wrap so the result is tagged as arXiv and points at the entry URL
    # instead of the already-deleted temporary file.
    return Document(
        source_type=SourceType.ARXIV,
        title=document.title,
        text=document.text,
        source=paper.entry_id,
        metadata=document.metadata,
    )
|
app/extractors/pdf.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
from pypdf import PdfReader
|
| 4 |
+
|
| 5 |
+
from app.core.models import Document, SourceType
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def extract_pdf(path: str | Path, title: str | None = None, metadata: dict | None = None) -> Document:
    """Read the selectable text of a PDF file into a ``Document``.

    Args:
        path: Location of the PDF file.
        title: Title to use; the file stem is used when omitted or empty.
        metadata: Extra entries merged over the ``pages`` count.

    Returns:
        A ``Document`` of type ``SourceType.PDF`` whose text contains one
        ``[Page N]`` section per page that yielded non-blank text.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If no page yields any text.
    """
    pdf_path = Path(path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    reader = PdfReader(str(pdf_path))

    # Pair each 1-based page number with its stripped text, then keep the
    # pages that actually contain something.
    numbered_pages = (
        (number, (page.extract_text() or "").strip())
        for number, page in enumerate(reader.pages, start=1)
    )
    sections = [f"\n\n[Page {number}]\n{body}" for number, body in numbered_pages if body]

    combined_text = "\n".join(sections).strip()
    if not combined_text:
        raise ValueError("No selectable text was found in this PDF.")

    extra = metadata or {}
    return Document(
        source_type=SourceType.PDF,
        title=title or pdf_path.stem,
        text=combined_text,
        source=str(pdf_path),
        metadata={"pages": len(reader.pages), **extra},
    )
|
app/extractors/youtube.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from urllib.parse import parse_qs, urlparse
|
| 2 |
+
|
| 3 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
| 4 |
+
|
| 5 |
+
from app.core.models import Document, SourceType
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _extract_video_id(url: str) -> str:
    """Return the video ID embedded in a YouTube URL.

    Supported forms are ``youtu.be/<id>``, ``youtube.com/watch?v=<id>`` and the
    path-based ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and ``/v/<id>``
    variants. URLs written without a scheme (``youtu.be/<id>``) are accepted.

    Args:
        url: The URL to inspect; surrounding whitespace is ignored.

    Returns:
        The non-empty video ID.

    Raises:
        ValueError: If no video ID can be found in the URL.
    """
    candidate = url.strip()
    parsed = urlparse(candidate)
    if not parsed.netloc:
        # Without a scheme urlparse puts the host into the path; retry with one.
        parsed = urlparse(f"https://{candidate}")

    host = parsed.netloc.lower()
    segments = [segment for segment in parsed.path.split("/") if segment]

    video_id = ""
    if host.endswith("youtu.be"):
        # Short links carry the ID as the first path segment.
        video_id = segments[0] if segments else ""
    elif "youtube.com" in host:
        query = parse_qs(parsed.query)
        if "v" in query:
            video_id = query["v"][0]
        elif len(segments) >= 2 and segments[0] in {"shorts", "embed", "live", "v"}:
            video_id = segments[1]

    # An empty ID (e.g. "https://youtu.be/") is treated the same as no ID at all.
    if not video_id:
        raise ValueError("Could not find a YouTube video ID in the URL.")
    return video_id
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def extract_youtube(url: str) -> Document:
    """Fetch a YouTube video's transcript and return it as a ``Document``.

    Each transcript segment with non-blank text becomes one line of the form
    ``[MM:SS] text``.

    Args:
        url: URL of the YouTube video.

    Returns:
        A ``Document`` of type ``SourceType.YOUTUBE`` whose metadata holds the
        video ID and the number of transcript segments.

    Raises:
        ValueError: If the URL has no video ID or the transcript is empty.
    """
    video_id = _extract_video_id(url)

    # Use the instance-level ``fetch`` when the installed library offers it,
    # otherwise fall back to the ``get_transcript`` call.
    api = YouTubeTranscriptApi()
    if hasattr(api, "fetch"):
        transcript_items = api.fetch(video_id).to_raw_data()
    else:
        transcript_items = YouTubeTranscriptApi.get_transcript(video_id)

    if not transcript_items:
        raise ValueError("No transcript was available for this YouTube video.")

    rendered: list[str] = []
    for segment in transcript_items:
        minutes, seconds = divmod(int(segment.get("start", 0)), 60)
        caption = segment.get("text", "").strip()
        if not caption:
            continue
        rendered.append(f"[{minutes:02d}:{seconds:02d}] {caption}")

    return Document(
        source_type=SourceType.YOUTUBE,
        title=f"YouTube Transcript {video_id}",
        text="\n".join(rendered).strip(),
        source=url,
        metadata={"video_id": video_id, "segments": len(transcript_items)},
    )
|
app/services/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Application service layer."""
|
app/services/chat.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
|
| 3 |
+
from openai import OpenAI
|
| 4 |
+
|
| 5 |
+
from app.core.config import settings
|
| 6 |
+
from app.core.models import SearchResult
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@dataclass(frozen=True)
class ChatAnswer:
    """Immutable result of one context-grounded chat completion."""

    answer: str  # text of the model's reply
    reasoning: str | None  # reasoning text when the response carries it, else None
    context: list[SearchResult]  # search results the answer was generated from
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class NvidiaChatClient:
    """Chat client that talks to the NVIDIA endpoint through the OpenAI SDK."""

    def __init__(self):
        """Create the underlying client.

        Raises:
            ValueError: If ``settings.NVIDIA_API_KEY`` is empty.
        """
        api_key = settings.NVIDIA_API_KEY
        if not api_key:
            raise ValueError("NVIDIA_API_KEY is required for NVIDIA chat completions.")

        self.client = OpenAI(api_key=api_key, base_url=settings.NVIDIA_API_URL)

    def answer_with_context(self, question: str, context: list[SearchResult]) -> ChatAnswer:
        """Ask the chat model to answer ``question`` from the given results.

        Args:
            question: The user's question.
            context: Search results, rendered as numbered entries (starting at
                1) so the model can cite them as ``[1]``, ``[2]``, ...

        Returns:
            A ``ChatAnswer`` with the reply text (empty string when the message
            has no content), the message's ``reasoning_content`` attribute when
            present, and the unchanged ``context``.
        """
        numbered_sources = []
        for position, hit in enumerate(context, start=1):
            entry_lines = [
                f"[{position}] title={hit.title}",
                f"source={hit.source}",
                f"score={hit.score:.4f}",
                f"text={hit.text}",
            ]
            numbered_sources.append("\n".join(entry_lines))
        context_text = "\n\n".join(numbered_sources)

        system_prompt = (
            "You are KnowledgeHub's retrieval assistant. Answer only from the "
            "provided context. If the context is insufficient, say what is missing. "
            "Cite sources using bracket numbers like [1], [2]."
        )
        user_prompt = f"Question:\n{question}\n\nRetrieved context:\n{context_text}"

        # Thinking-token limits are not standard SDK arguments, so they are
        # passed through ``extra_body``.
        thinking_budget = {
            "min_thinking_tokens": settings.MIN_THINKING_TOKENS,
            "max_thinking_tokens": settings.MAX_THINKING_TOKENS,
        }
        completion = self.client.chat.completions.create(
            model=settings.NVIDIA_CHAT_MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=settings.CHAT_TEMPERATURE,
            top_p=settings.CHAT_TOP_P,
            max_tokens=settings.CHAT_MAX_TOKENS,
            frequency_penalty=0,
            presence_penalty=0,
            stream=False,
            extra_body=thinking_budget,
        )

        reply = completion.choices[0].message
        return ChatAnswer(
            answer=reply.content or "",
            reasoning=getattr(reply, "reasoning_content", None),
            context=context,
        )
|
app/services/chunking.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import uuid
|
| 3 |
+
|
| 4 |
+
from app.core.models import Chunk, Document
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def chunk_document(document: Document, chunk_size: int, overlap: int) -> list[Chunk]:
    """Split a document's text into overlapping character-based chunks.

    Runs of three or more newlines are collapsed to two before splitting. Each
    chunk spans at most ``chunk_size`` characters; when a paragraph break or a
    sentence end (``". "``) lies in the second half of the window, the chunk
    is cut there instead. Consecutive chunks share ``overlap`` characters.

    Args:
        document: Document whose ``text`` is split; its provenance fields are
            copied onto every chunk.
        chunk_size: Maximum chunk length in characters; must be positive.
        overlap: Characters shared between consecutive chunks; must be
            non-negative and smaller than ``chunk_size``.

    Returns:
        The chunks in document order, each with a deterministic UUIDv5 ID.

    Raises:
        ValueError: If the sizes are invalid or the text is empty.
    """
    if chunk_size <= 0:
        raise ValueError("Chunk size must be a positive integer.")
    if overlap < 0:
        raise ValueError("Chunk overlap must not be negative.")
    if overlap >= chunk_size:
        raise ValueError("Chunk overlap must be smaller than chunk size.")

    normalized = re.sub(r"\n{3,}", "\n\n", document.text).strip()
    if not normalized:
        raise ValueError("Document is empty after extraction.")

    total = len(normalized)
    chunks: list[Chunk] = []
    start = 0
    index = 0
    while start < total:
        end = min(start + chunk_size, total)
        if end < total:
            # Prefer cutting at the last paragraph or sentence boundary, but
            # only when that keeps the chunk longer than half the window.
            paragraph_break = normalized.rfind("\n\n", start, end)
            sentence_break = normalized.rfind(". ", start, end)
            best_break = max(paragraph_break, sentence_break)
            if best_break > start + chunk_size // 2:
                end = best_break + 1

        text = normalized[start:end].strip()
        if text:
            # The ID is derived from source, position, and leading text.
            digest = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{document.source}:{index}:{text[:80]}"))
            chunks.append(
                Chunk(
                    id=digest,
                    text=text,
                    index=index,
                    source_type=document.source_type,
                    source=document.source,
                    title=document.title,
                    metadata=document.metadata,
                )
            )
            index += 1

        if end == total:
            break

        # A boundary cut can shorten the chunk so much that stepping back by
        # ``overlap`` would land at or before ``start`` and loop forever. In
        # that case continue at ``end`` without overlap to guarantee progress.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks
|
app/services/embeddings.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from functools import cached_property, lru_cache
|
| 2 |
+
|
| 3 |
+
from app.core.config import settings
|
| 4 |
+
from app.utils.zerogpu import is_enabled as zerogpu_is_enabled
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class LocalEmbeddingClient:
|
| 8 |
+
def __init__(self, model: str | None = None, device: str | None = None):
|
| 9 |
+
self.model_name = model or settings.NEMOTRON_EMBED_MODEL
|
| 10 |
+
self.device = device or _resolve_device()
|
| 11 |
+
|
| 12 |
+
@cached_property
|
| 13 |
+
def model(self):
|
| 14 |
+
try:
|
| 15 |
+
from sentence_transformers import SentenceTransformer
|
| 16 |
+
except ImportError as exc:
|
| 17 |
+
raise ImportError(
|
| 18 |
+
"sentence-transformers is required for local embeddings. "
|
| 19 |
+
"Install dependencies with `pip install -r requirements.txt`."
|
| 20 |
+
) from exc
|
| 21 |
+
|
| 22 |
+
return SentenceTransformer(
|
| 23 |
+
self.model_name,
|
| 24 |
+
device=self.device,
|
| 25 |
+
token=settings.HF_TOKEN or None,
|
| 26 |
+
trust_remote_code=True,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
@cached_property
|
| 30 |
+
def native_model(self):
|
| 31 |
+
try:
|
| 32 |
+
from transformers import AutoModel
|
| 33 |
+
except ImportError as exc:
|
| 34 |
+
raise ImportError(
|
| 35 |
+
"transformers is required for native local embeddings. "
|
| 36 |
+
"Install dependencies with `pip install -r requirements.txt`."
|
| 37 |
+
) from exc
|
| 38 |
+
|
| 39 |
+
model = AutoModel.from_pretrained(
|
| 40 |
+
self.model_name,
|
| 41 |
+
token=settings.HF_TOKEN or None,
|
| 42 |
+
trust_remote_code=True,
|
| 43 |
+
dtype="auto" if self.device != "cpu" else None,
|
| 44 |
+
)
|
| 45 |
+
if self.device:
|
| 46 |
+
model = model.to(self.device)
|
| 47 |
+
return model.eval()
|
| 48 |
+
|
| 49 |
+
def embed_texts(self, texts: list[str]) -> list[list[float]]:
|
| 50 |
+
if not texts:
|
| 51 |
+
return []
|
| 52 |
+
|
| 53 |
+
try:
|
| 54 |
+
embeddings = self.model.encode(
|
| 55 |
+
texts,
|
| 56 |
+
batch_size=8,
|
| 57 |
+
normalize_embeddings=True,
|
| 58 |
+
show_progress_bar=False,
|
| 59 |
+
)
|
| 60 |
+
return embeddings.tolist()
|
| 61 |
+
except ValueError as exc:
|
| 62 |
+
if "Modality 'text' is not supported" not in str(exc):
|
| 63 |
+
raise
|
| 64 |
+
|
| 65 |
+
embeddings = self._embed_with_native_query_encoder(texts)
|
| 66 |
+
return embeddings.tolist()
|
| 67 |
+
|
| 68 |
+
def _embed_with_native_query_encoder(self, texts: list[str]):
|
| 69 |
+
try:
|
| 70 |
+
import torch
|
| 71 |
+
import torch.nn.functional as F
|
| 72 |
+
except ImportError as exc:
|
| 73 |
+
raise ImportError(
|
| 74 |
+
"torch is required for the native Nemotron embedding path. "
|
| 75 |
+
"Install dependencies with `pip install -r requirements.txt`."
|
| 76 |
+
) from exc
|
| 77 |
+
|
| 78 |
+
if not hasattr(self.native_model, "forward_queries"):
|
| 79 |
+
raise ValueError(
|
| 80 |
+
f"{self.model_name} does not support SentenceTransformer text encoding "
|
| 81 |
+
"or a native forward_queries API."
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
with torch.no_grad():
|
| 85 |
+
output = self.native_model.forward_queries(texts, batch_size=4)
|
| 86 |
+
|
| 87 |
+
if isinstance(output, (list, tuple)):
|
| 88 |
+
output = output[0]
|
| 89 |
+
|
| 90 |
+
if not torch.is_tensor(output):
|
| 91 |
+
output = torch.as_tensor(output)
|
| 92 |
+
|
| 93 |
+
if output.ndim == 3:
|
| 94 |
+
output = output.float().mean(dim=1)
|
| 95 |
+
elif output.ndim != 2:
|
| 96 |
+
raise ValueError(f"Unexpected embedding shape from {self.model_name}: {tuple(output.shape)}")
|
| 97 |
+
|
| 98 |
+
return F.normalize(output.float(), p=2, dim=1).cpu()
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
@lru_cache(maxsize=1)
def get_embedding_client() -> LocalEmbeddingClient:
    """Return the shared ``LocalEmbeddingClient``.

    The single-entry cache means the client is constructed on the first call
    and the same instance is handed back afterwards.
    """
    client = LocalEmbeddingClient()
    return client
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def _resolve_device() -> str:
    """Pick the device string for the embedding model.

    Returns ``"cuda"`` when ZeroGPU is enabled while the configured device is
    ``"cpu"``; otherwise returns ``settings.EMBEDDING_DEVICE`` unchanged.
    """
    zerogpu_active = zerogpu_is_enabled()
    configured = settings.EMBEDDING_DEVICE
    if zerogpu_active and configured == "cpu":
        return "cuda"
    return configured
|
app/services/ingestion.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
from app.core.config import settings
|
| 4 |
+
from app.core.models import Document, IngestionResult, SourceType
|
| 5 |
+
from app.extractors.arxiv import extract_arxiv
|
| 6 |
+
from app.extractors.pdf import extract_pdf
|
| 7 |
+
from app.extractors.youtube import extract_youtube
|
| 8 |
+
from app.services.chat import NvidiaChatClient
|
| 9 |
+
from app.services.chunking import chunk_document
|
| 10 |
+
from app.services.embeddings import get_embedding_client
|
| 11 |
+
from app.services.vector_store import QdrantVectorStore
|
| 12 |
+
from app.utils.source_detection import detect_source
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Directory (relative to the working directory) that receives markdown exports.
EXPORT_DIR = Path("data") / "exports"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def extract_document(url: str | None = None, pdf_path: str | None = None) -> Document:
|
| 19 |
+
source_type = detect_source(url, pdf_path)
|
| 20 |
+
if source_type == SourceType.PDF:
|
| 21 |
+
return extract_pdf(str(pdf_path))
|
| 22 |
+
if source_type == SourceType.ARXIV:
|
| 23 |
+
return extract_arxiv(str(url))
|
| 24 |
+
if source_type == SourceType.YOUTUBE:
|
| 25 |
+
return extract_youtube(str(url))
|
| 26 |
+
raise ValueError(f"Unsupported source type: {source_type}")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def save_markdown(document: Document, chunks_count: int) -> Path:
    """Write the document as a markdown file under ``EXPORT_DIR``.

    The file name is the title with every character other than alphanumerics,
    ``-`` and ``_`` replaced by ``_``, cut to 80 characters; an empty result
    falls back to the source type value. An existing file of the same name is
    overwritten.

    Args:
        document: Document to export.
        chunks_count: Number reported on the "Chunks uploaded" line.

    Returns:
        Path of the written file.
    """
    EXPORT_DIR.mkdir(parents=True, exist_ok=True)

    sanitized = "".join(ch if (ch.isalnum() or ch in "-_") else "_" for ch in document.title)
    stem = sanitized[:80] or document.source_type.value
    path = EXPORT_DIR / f"{stem}.md"

    header = [
        f"# {document.title}",
        "",
        f"- Source type: {document.source_type.value}",
        f"- Source: {document.source}",
        f"- Chunks uploaded: {chunks_count}",
    ]
    # All metadata entries form a single block, one bullet per key.
    metadata_block = "\n".join(f"- {key}: {value}" for key, value in document.metadata.items())
    body = ["", "## Extracted Text", "", document.text]

    path.write_text("\n".join([*header, metadata_block, *body]), encoding="utf-8")
    return path
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def ingest_source(
    url: str | None,
    pdf_path: str | None,
    chunk_size: int | None = None,
    chunk_overlap: int | None = None,
    collection_name: str | None = None,
) -> IngestionResult:
    """Run the full ingestion pipeline for one source.

    Steps: extract the document, split it into chunks, embed the chunk texts,
    upsert chunks and vectors into the vector store, and export the document
    as markdown.

    Args:
        url: Source URL, if any.
        pdf_path: Path of an uploaded PDF, if any.
        chunk_size: Chunk length; ``settings.CHUNK_SIZE`` when omitted or 0.
        chunk_overlap: Overlap between chunks; ``settings.CHUNK_OVERLAP`` only
            when omitted, so an explicit ``0`` disables overlap.
        collection_name: Target collection; the store's default when omitted.

    Returns:
        An ``IngestionResult`` describing what was stored and exported.
    """
    document = extract_document(url=url, pdf_path=pdf_path)

    # ``or`` is fine for the size (0 is never a usable chunk size), but the
    # overlap must be compared against None: 0 is a legitimate request for
    # non-overlapping chunks and must not be replaced by the default.
    effective_size = chunk_size or settings.CHUNK_SIZE
    effective_overlap = settings.CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap

    chunks = chunk_document(
        document,
        chunk_size=effective_size,
        overlap=effective_overlap,
    )
    embeddings = get_embedding_client().embed_texts([chunk.text for chunk in chunks])
    store = QdrantVectorStore(collection_name=collection_name)
    store.upsert_chunks(chunks, embeddings)
    export_path = save_markdown(document, len(chunks))
    return IngestionResult(
        document=document,
        chunks=chunks,
        collection_name=store.collection_name,
        export_path=export_path,
    )
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def search_knowledge_base(query: str, limit: int = 5, collection_name: str | None = None):
|
| 80 |
+
query_text = query.strip()
|
| 81 |
+
if not query_text:
|
| 82 |
+
raise ValueError("Enter a query to search.")
|
| 83 |
+
embedding = get_embedding_client().embed_texts([query_text])[0]
|
| 84 |
+
return QdrantVectorStore(collection_name=collection_name).search(embedding, limit=limit)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def answer_from_knowledge_base(query: str, limit: int = 5, collection_name: str | None = None):
    """Answer a question from the knowledge base.

    Retrieves results with ``search_knowledge_base`` and passes them, together
    with the original query, to a new ``NvidiaChatClient``.
    """
    retrieved = search_knowledge_base(query, limit=limit, collection_name=collection_name)
    chat_client = NvidiaChatClient()
    return chat_client.answer_with_context(query, retrieved)
|
app/services/vector_store.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from qdrant_client import QdrantClient
|
| 2 |
+
from qdrant_client.http.models import Distance, PointStruct, VectorParams
|
| 3 |
+
|
| 4 |
+
from app.core.config import settings
|
| 5 |
+
from app.core.models import Chunk, SearchResult
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class QdrantVectorStore:
    """Stores chunks and their embeddings in one Qdrant collection."""

    def __init__(self, collection_name: str | None = None):
        """Connect to Qdrant using the application settings.

        Args:
            collection_name: Collection to use; falls back to
                ``settings.QDRANT_COLLECTION_NAME`` when falsy.
        """
        self.collection_name = collection_name or settings.QDRANT_COLLECTION_NAME
        self.client = QdrantClient(
            url=settings.get_qdrant_url(),
            api_key=settings.QDRANT_API_KEY or None,
            timeout=60,
        )

    def ensure_collection(self, vector_size: int) -> None:
        """Create the collection with cosine distance if it does not exist.

        An existing collection is left untouched; its vector size is not
        compared with ``vector_size``.
        """
        collections = self.client.get_collections().collections
        exists = any(collection.name == self.collection_name for collection in collections)
        if not exists:
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
            )

    def upsert_chunks(self, chunks: list[Chunk], embeddings: list[list[float]]) -> None:
        """Upsert every chunk with its embedding as one point.

        Args:
            chunks: Chunks to store; their ``id`` becomes the point ID.
            embeddings: One vector per chunk, in the same order.

        Raises:
            ValueError: If the two lists differ in length.
        """
        if len(chunks) != len(embeddings):
            raise ValueError("Chunks and embeddings must have the same length.")
        if not chunks:
            return

        # The first vector's length defines the collection's dimensionality.
        self.ensure_collection(vector_size=len(embeddings[0]))
        points = [
            PointStruct(
                id=chunk.id,
                vector=embedding,
                payload={
                    "text": chunk.text,
                    "chunk_index": chunk.index,
                    "source_type": chunk.source_type.value,
                    "source": chunk.source,
                    "title": chunk.title,
                    "metadata": chunk.metadata,
                },
            )
            for chunk, embedding in zip(chunks, embeddings, strict=True)
        ]
        self.client.upsert(collection_name=self.collection_name, points=points)

    def search(self, query_embedding: list[float], limit: int = 5) -> list[SearchResult]:
        """Return the stored chunks closest to ``query_embedding``.

        Uses ``query_points`` when the client provides it and the older
        ``search`` method otherwise.

        Args:
            query_embedding: Query vector.
            limit: Maximum number of hits to request.

        Returns:
            One ``SearchResult`` per hit; missing payload fields become empty
            strings and missing or null metadata becomes an empty dict.
        """
        if hasattr(self.client, "query_points"):
            response = self.client.query_points(
                collection_name=self.collection_name,
                query=query_embedding,
                limit=limit,
                with_payload=True,
            )
            hits = response.points
        else:
            hits = self.client.search(
                collection_name=self.collection_name,
                query_vector=query_embedding,
                limit=limit,
                with_payload=True,
            )

        results: list[SearchResult] = []
        for hit in hits:
            payload = hit.payload or {}
            results.append(
                SearchResult(
                    score=float(hit.score),
                    text=str(payload.get("text", "")),
                    title=str(payload.get("title", "")),
                    source=str(payload.get("source", "")),
                    source_type=str(payload.get("source_type", "")),
                    # ``or {}`` also covers a payload whose metadata is an
                    # explicit null, which ``dict(None)`` would reject.
                    metadata=dict(payload.get("metadata") or {}),
                )
            )
        return results
|
app/ui/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""UI package."""
|
app/ui/gradio_app.py
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import traceback
|
| 3 |
+
|
| 4 |
+
import gradio as gr
|
| 5 |
+
|
| 6 |
+
from app.core.config import settings
|
| 7 |
+
from app.ui.theme import CSS, HEAD, JS
|
| 8 |
+
from app.utils.zerogpu import gpu
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
THEME = gr.themes.Base(
|
| 13 |
+
primary_hue="cyan",
|
| 14 |
+
secondary_hue="lime",
|
| 15 |
+
neutral_hue="slate",
|
| 16 |
+
radius_size="sm",
|
| 17 |
+
font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
|
| 18 |
+
font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "ui-monospace", "monospace"],
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _format_metadata(metadata: dict) -> str:
|
| 23 |
+
if not metadata:
|
| 24 |
+
return "No metadata found."
|
| 25 |
+
rows = []
|
| 26 |
+
for key, value in metadata.items():
|
| 27 |
+
rows.append(f"**{key}**: {value}")
|
| 28 |
+
return "\n\n".join(rows)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@gpu()
def _ingest(
    url: str,
    pdf_file: str | None,
    chunk_size: int,
    chunk_overlap: int,
    collection_name: str,
):
    """Run the ingestion pipeline for one source and format the outcome for the UI.

    Args:
        url: Link or identifier typed by the user.
        pdf_file: Path of an uploaded PDF, or ``None``.
        chunk_size: Chunk size forwarded to the pipeline.
        chunk_overlap: Chunk overlap forwarded to the pipeline.
        collection_name: Target Qdrant collection.

    Returns:
        An 8-tuple of strings: status Markdown, title, source type, character
        count, chunk count, metadata Markdown, text preview, export path.
        On failure the status carries the error and a short traceback, the
        two counts are ``"0"`` and the remaining fields are empty.
    """
    logger.info(
        "Ingest requested url=%s pdf_file=%s chunk_size=%s chunk_overlap=%s collection=%s",
        url,
        pdf_file,
        chunk_size,
        chunk_overlap,
        collection_name,
    )
    try:
        # Imported lazily, inside the handler, as in the other UI callbacks.
        from app.services.ingestion import ingest_source

        result = ingest_source(
            url=url,
            pdf_path=pdf_file,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            collection_name=collection_name,
        )
        document = result.document
        status = (
            f"### Ingestion complete\n\n"
            f"Uploaded **{len(result.chunks)} chunks** into Qdrant collection "
            f"`{result.collection_name}`.\n\n"
            f"Saved extracted text to `{result.export_path}`."
        )
        # Cap what is sent to the browser; the full text lives in the export file.
        preview = document.text[:12000]
        if len(document.text) > len(preview):
            preview += "\n\n[Preview truncated in UI. Full text is saved in the export file.]"
        return (
            status,
            document.title,
            document.source_type.value,
            str(len(document.text)),
            str(len(result.chunks)),
            _format_metadata(document.metadata),
            preview,
            str(result.export_path),
        )
    except Exception as exc:
        # UI boundary: surface the failure to the user, but also record the
        # full traceback server-side so it is not lost.
        logger.exception("Ingestion failed")
        return (
            f"### Ingestion failed\n\n`{type(exc).__name__}: {exc}`\n\n```text\n{traceback.format_exc(limit=2)}\n```",
            "",
            "",
            "0",
            "0",
            "",
            "",
            "",
        )
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
@gpu()
def _search(query: str, limit: int, collection_name: str):
    """Run a similarity search and render the hits as Markdown.

    Args:
        query: Free-text query.
        limit: Maximum number of hits to request.
        collection_name: Qdrant collection to search.

    Returns:
        Markdown with one section per hit (title, score, source, and the first
        1200 characters of text) separated by horizontal rules; a
        "No matches found." sentence when there are no hits; or a
        "Search failed" message when the lookup raises.
    """
    logger.info("Search requested query=%s limit=%s collection=%s", query, limit, collection_name)
    try:
        from app.services.ingestion import search_knowledge_base

        results = search_knowledge_base(query, limit=limit, collection_name=collection_name)
    except Exception as exc:
        # UI boundary: report to the user, and keep the traceback in the logs.
        logger.exception("Search failed")
        if "MPS backend out of memory" in str(exc):
            return (
                "### Search failed\n\n"
                "The local embedding model ran out of Apple GPU memory. "
                "Restart the app so the new CPU embedding setting takes effect. "
                "Keep `EMBEDDING_DEVICE=cpu` in `.env`."
            )
        return f"### Search failed\n\n`{type(exc).__name__}: {exc}`"

    if not results:
        return "No matches found."

    blocks = [
        "\n".join(
            [
                f"### {index}. {result.title}",
                f"**Score:** {result.score:.4f}",
                f"**Source:** {result.source_type} | {result.source}",
                "",
                result.text[:1200],
            ]
        )
        for index, result in enumerate(results, start=1)
    ]
    return "\n\n---\n\n".join(blocks)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
@gpu()
def _answer(query: str, limit: int, collection_name: str):
    """Answer a question from the knowledge base and format the result for the UI.

    Args:
        query: The user's question.
        limit: Maximum number of context chunks to retrieve.
        collection_name: Qdrant collection to read from.

    Returns:
        A 3-tuple ``(answer, reasoning, context_markdown)``. On failure the
        first element is an "Answer failed" message and the other two are
        empty strings. When no reasoning is present on the result, a fixed
        placeholder sentence is returned in its place.
    """
    logger.info("Answer requested query=%s limit=%s collection=%s", query, limit, collection_name)
    try:
        from app.services.ingestion import answer_from_knowledge_base

        result = answer_from_knowledge_base(query, limit=limit, collection_name=collection_name)
    except Exception as exc:
        # UI boundary: report to the user, and keep the traceback in the logs.
        logger.exception("Answer failed")
        if "MPS backend out of memory" in str(exc):
            return (
                "### Answer failed\n\n"
                "The local embedding model ran out of Apple GPU memory. "
                "Restart the app so the new CPU embedding setting takes effect. "
                "Keep `EMBEDDING_DEVICE=cpu` in `.env`.",
                "",
                "",
            )
        return f"### Answer failed\n\n`{type(exc).__name__}: {exc}`", "", ""

    # One Markdown section per retrieved chunk, text capped at 1000 characters.
    context_blocks = [
        "\n".join(
            [
                f"### [{index}] {item.title}",
                f"**Score:** {item.score:.4f}",
                f"**Source:** {item.source_type} | {item.source}",
                "",
                item.text[:1000],
            ]
        )
        for index, item in enumerate(result.context, start=1)
    ]

    reasoning = result.reasoning or "No reasoning content was returned by the API."
    return result.answer, reasoning, "\n\n---\n\n".join(context_blocks)
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def build_app() -> gr.Blocks:
|
| 159 |
+
with gr.Blocks(
|
| 160 |
+
title=f"{settings.PROJECT_NAME} Ingestor",
|
| 161 |
+
) as demo:
|
| 162 |
+
with gr.Column(elem_id="kh-shell"):
|
| 163 |
+
gr.Markdown(
|
| 164 |
+
f"""
|
| 165 |
+
# {settings.PROJECT_NAME}
|
| 166 |
+
Turn papers, PDFs, and videos into a searchable vector memory.
|
| 167 |
+
|
| 168 |
+
Extract text, chunk it cleanly, embed locally, and use NVIDIA chat for grounded answers.
|
| 169 |
+
""",
|
| 170 |
+
elem_id="kh-title",
|
| 171 |
+
)
|
| 172 |
+
gr.HTML(
|
| 173 |
+
f"""
|
| 174 |
+
<div class="kh-chip-row">
|
| 175 |
+
<div class="kh-chip">Embeddings <code>{settings.NEMOTRON_EMBED_MODEL}</code></div>
|
| 176 |
+
<div class="kh-chip">Parser <code>{settings.NEMOTRON_PARSE_MODEL}</code></div>
|
| 177 |
+
<div class="kh-chip">Chat <code>{settings.NVIDIA_CHAT_MODEL}</code></div>
|
| 178 |
+
<div class="kh-chip">Collection <code>{settings.QDRANT_COLLECTION_NAME}</code></div>
|
| 179 |
+
<div class="kh-chip">Sources PDF · arXiv · YouTube</div>
|
| 180 |
+
</div>
|
| 181 |
+
""",
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
with gr.Tabs():
|
| 185 |
+
with gr.Tab("Ingest"):
|
| 186 |
+
with gr.Row(equal_height=True):
|
| 187 |
+
with gr.Column(scale=5, elem_classes=["kh-panel"]):
|
| 188 |
+
gr.Markdown(
|
| 189 |
+
"### Source Intake\n<div class='kh-subhead'>Upload a PDF or paste one link. The pipeline handles extraction, chunking, local embeddings, and Qdrant upload.</div>"
|
| 190 |
+
)
|
| 191 |
+
source_url = gr.Textbox(
|
| 192 |
+
label="YouTube or arXiv input",
|
| 193 |
+
placeholder="Paste a YouTube URL, arXiv URL, or arXiv ID",
|
| 194 |
+
lines=2,
|
| 195 |
+
)
|
| 196 |
+
pdf_file = gr.File(
|
| 197 |
+
label="PDF document",
|
| 198 |
+
file_types=[".pdf"],
|
| 199 |
+
type="filepath",
|
| 200 |
+
)
|
| 201 |
+
with gr.Row():
|
| 202 |
+
chunk_size = gr.Slider(
|
| 203 |
+
400,
|
| 204 |
+
2500,
|
| 205 |
+
value=settings.CHUNK_SIZE,
|
| 206 |
+
step=50,
|
| 207 |
+
label="Chunk size",
|
| 208 |
+
)
|
| 209 |
+
chunk_overlap = gr.Slider(
|
| 210 |
+
0,
|
| 211 |
+
600,
|
| 212 |
+
value=settings.CHUNK_OVERLAP,
|
| 213 |
+
step=25,
|
| 214 |
+
label="Chunk overlap",
|
| 215 |
+
)
|
| 216 |
+
collection_name_ingest = gr.Textbox(
|
| 217 |
+
label="Collection Name",
|
| 218 |
+
value=settings.QDRANT_COLLECTION_NAME,
|
| 219 |
+
placeholder="Enter Qdrant collection name",
|
| 220 |
+
)
|
| 221 |
+
ingest_btn = gr.Button("Ingest into Qdrant", variant="primary")
|
| 222 |
+
|
| 223 |
+
with gr.Column(scale=4, elem_classes=["kh-panel"]):
|
| 224 |
+
gr.Markdown("### Pipeline Status")
|
| 225 |
+
status = gr.Markdown(elem_id="kh-status")
|
| 226 |
+
with gr.Row():
|
| 227 |
+
title = gr.Textbox(
|
| 228 |
+
label="Title",
|
| 229 |
+
interactive=False,
|
| 230 |
+
elem_classes=["kh-stat"],
|
| 231 |
+
)
|
| 232 |
+
source_type = gr.Textbox(
|
| 233 |
+
label="Type",
|
| 234 |
+
interactive=False,
|
| 235 |
+
elem_classes=["kh-stat"],
|
| 236 |
+
)
|
| 237 |
+
with gr.Row():
|
| 238 |
+
char_count = gr.Textbox(
|
| 239 |
+
label="Characters",
|
| 240 |
+
interactive=False,
|
| 241 |
+
elem_classes=["kh-stat"],
|
| 242 |
+
)
|
| 243 |
+
chunk_count = gr.Textbox(
|
| 244 |
+
label="Chunks",
|
| 245 |
+
interactive=False,
|
| 246 |
+
elem_classes=["kh-stat"],
|
| 247 |
+
)
|
| 248 |
+
export_path = gr.Textbox(label="Export file", interactive=False)
|
| 249 |
+
|
| 250 |
+
with gr.Row(equal_height=True):
|
| 251 |
+
metadata = gr.Markdown(label="Metadata", elem_classes=["kh-panel"])
|
| 252 |
+
text_preview = gr.Textbox(
|
| 253 |
+
label="Extracted text preview",
|
| 254 |
+
lines=18,
|
| 255 |
+
interactive=False,
|
| 256 |
+
elem_id="kh-text-preview",
|
| 257 |
+
elem_classes=["kh-panel"],
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
+
ingest_btn.click(
|
| 261 |
+
fn=_ingest,
|
| 262 |
+
inputs=[source_url, pdf_file, chunk_size, chunk_overlap, collection_name_ingest],
|
| 263 |
+
outputs=[
|
| 264 |
+
status,
|
| 265 |
+
title,
|
| 266 |
+
source_type,
|
| 267 |
+
char_count,
|
| 268 |
+
chunk_count,
|
| 269 |
+
metadata,
|
| 270 |
+
text_preview,
|
| 271 |
+
export_path,
|
| 272 |
+
],
|
| 273 |
+
)
|
| 274 |
+
|
| 275 |
+
with gr.Tab("Retrieve"):
|
| 276 |
+
with gr.Row(equal_height=True):
|
| 277 |
+
with gr.Column(scale=3, elem_classes=["kh-panel"]):
|
| 278 |
+
gr.Markdown(
|
| 279 |
+
"### Retrieval Probe\n<div class='kh-subhead'>Run a quick similarity search against the same Qdrant collection after ingestion.</div>"
|
| 280 |
+
)
|
| 281 |
+
query = gr.Textbox(
|
| 282 |
+
label="Search query",
|
| 283 |
+
placeholder="Ask a question or enter keywords",
|
| 284 |
+
lines=4,
|
| 285 |
+
)
|
| 286 |
+
limit = gr.Slider(1, 10, value=5, step=1, label="Results")
|
| 287 |
+
collection_name_retrieve = gr.Textbox(
|
| 288 |
+
label="Collection Name",
|
| 289 |
+
value=settings.QDRANT_COLLECTION_NAME,
|
| 290 |
+
placeholder="Enter Qdrant collection name",
|
| 291 |
+
)
|
| 292 |
+
with gr.Row():
|
| 293 |
+
search_btn = gr.Button("Search Qdrant", variant="secondary")
|
| 294 |
+
answer_btn = gr.Button("Answer with NVIDIA", variant="primary")
|
| 295 |
+
with gr.Column(scale=5, elem_classes=["kh-panel"]):
|
| 296 |
+
gr.Markdown("### Answer")
|
| 297 |
+
answer_output = gr.Markdown(elem_id="kh-answer")
|
| 298 |
+
|
| 299 |
+
with gr.Row(equal_height=True):
|
| 300 |
+
with gr.Column(elem_classes=["kh-panel"]):
|
| 301 |
+
gr.Markdown("### Matches")
|
| 302 |
+
search_results = gr.Markdown(elem_id="kh-search-results")
|
| 303 |
+
with gr.Column(elem_classes=["kh-panel"]):
|
| 304 |
+
gr.Markdown("### Reasoning")
|
| 305 |
+
reasoning_output = gr.Markdown(elem_id="kh-reasoning")
|
| 306 |
+
|
| 307 |
+
search_btn.click(
|
| 308 |
+
fn=_search,
|
| 309 |
+
inputs=[query, limit, collection_name_retrieve],
|
| 310 |
+
outputs=search_results,
|
| 311 |
+
)
|
| 312 |
+
answer_btn.click(
|
| 313 |
+
fn=_answer,
|
| 314 |
+
inputs=[query, limit, collection_name_retrieve],
|
| 315 |
+
outputs=[answer_output, reasoning_output, search_results],
|
| 316 |
+
)
|
| 317 |
+
|
| 318 |
+
# Sync collection names across tabs
|
| 319 |
+
collection_name_ingest.change(
|
| 320 |
+
fn=lambda x: x, inputs=[collection_name_ingest], outputs=[collection_name_retrieve]
|
| 321 |
+
)
|
| 322 |
+
collection_name_retrieve.change(
|
| 323 |
+
fn=lambda x: x, inputs=[collection_name_retrieve], outputs=[collection_name_ingest]
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
return demo
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def serve() -> None:
    """Build the Gradio app, enable its queue, and start the server on 0.0.0.0:7860."""
    logger.info("Building Gradio app")
    app = build_app()

    logger.info("Launching Gradio server on 0.0.0.0:7860")
    # NOTE(review): theme/css/js/head are passed to launch() here rather than to
    # gr.Blocks(); presumably this relies on a Gradio release whose launch()
    # accepts them. Confirm against the Gradio version actually installed.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "theme": THEME,
        "css": CSS,
        "js": JS,
        "head": HEAD,
    }
    app.queue().launch(**launch_options)
|
app/ui/theme.py
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
HEAD = """
|
| 2 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 3 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 4 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&family=JetBrains+Mono:wght@400;600&display=swap" rel="stylesheet">
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
JS = """
|
| 8 |
+
() => {
|
| 9 |
+
const root = document.querySelector('.gradio-container');
|
| 10 |
+
if (!root) return;
|
| 11 |
+
root.dataset.ready = 'true';
|
| 12 |
+
const marker = document.createElement('div');
|
| 13 |
+
marker.className = 'kh-scanline';
|
| 14 |
+
root.prepend(marker);
|
| 15 |
+
}
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
CSS = """
|
| 19 |
+
:root {
|
| 20 |
+
--kh-bg: #080b0f;
|
| 21 |
+
--kh-surface: rgba(18, 24, 32, 0.78);
|
| 22 |
+
--kh-surface-strong: rgba(27, 36, 48, 0.92);
|
| 23 |
+
--kh-ink: #f7fbff;
|
| 24 |
+
--kh-muted: #a7b4c2;
|
| 25 |
+
--kh-soft: #d8e1ea;
|
| 26 |
+
--kh-line: rgba(255, 255, 255, 0.12);
|
| 27 |
+
--kh-cyan: #20d6c7;
|
| 28 |
+
--kh-lime: #b8f45d;
|
| 29 |
+
--kh-rose: #ff6b8a;
|
| 30 |
+
--kh-amber: #ffcf5c;
|
| 31 |
+
--kh-shadow: rgba(0, 0, 0, 0.32);
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
.gradio-container {
|
| 35 |
+
min-height: 100vh;
|
| 36 |
+
background:
|
| 37 |
+
radial-gradient(circle at 18% 8%, rgba(32, 214, 199, 0.22), transparent 30%),
|
| 38 |
+
radial-gradient(circle at 86% 12%, rgba(255, 207, 92, 0.16), transparent 28%),
|
| 39 |
+
linear-gradient(135deg, #080b0f 0%, #101720 48%, #0b1017 100%) !important;
|
| 40 |
+
color: var(--kh-ink);
|
| 41 |
+
font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
.kh-scanline {
|
| 45 |
+
position: fixed;
|
| 46 |
+
inset: 0;
|
| 47 |
+
pointer-events: none;
|
| 48 |
+
background-image: linear-gradient(rgba(255,255,255,0.035) 1px, transparent 1px);
|
| 49 |
+
background-size: 100% 4px;
|
| 50 |
+
mask-image: linear-gradient(to bottom, transparent, black 18%, black 72%, transparent);
|
| 51 |
+
opacity: 0.18;
|
| 52 |
+
z-index: 0;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
#kh-shell {
|
| 56 |
+
position: relative;
|
| 57 |
+
z-index: 1;
|
| 58 |
+
max-width: 1220px;
|
| 59 |
+
margin: 0 auto;
|
| 60 |
+
padding: 28px 18px 42px;
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
#kh-title {
|
| 64 |
+
padding: 34px 0 22px;
|
| 65 |
+
border-bottom: 1px solid var(--kh-line);
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
#kh-title h1 {
|
| 69 |
+
max-width: 920px;
|
| 70 |
+
color: var(--kh-ink);
|
| 71 |
+
font-size: clamp(2.6rem, 6vw, 6rem);
|
| 72 |
+
font-weight: 800;
|
| 73 |
+
line-height: 0.9;
|
| 74 |
+
margin: 0 0 14px;
|
| 75 |
+
letter-spacing: 0;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
#kh-title p {
|
| 79 |
+
max-width: 780px;
|
| 80 |
+
color: var(--kh-muted);
|
| 81 |
+
font-size: 1.04rem;
|
| 82 |
+
line-height: 1.65;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
#kh-title code,
|
| 86 |
+
.kh-chip code {
|
| 87 |
+
color: var(--kh-lime);
|
| 88 |
+
background: rgba(184, 244, 93, 0.09);
|
| 89 |
+
border: 1px solid rgba(184, 244, 93, 0.18);
|
| 90 |
+
border-radius: 6px;
|
| 91 |
+
padding: 2px 6px;
|
| 92 |
+
font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, monospace;
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
.kh-panel {
|
| 96 |
+
border: 1px solid var(--kh-line);
|
| 97 |
+
border-radius: 8px;
|
| 98 |
+
background: linear-gradient(180deg, var(--kh-surface-strong), var(--kh-surface));
|
| 99 |
+
box-shadow: 0 24px 70px var(--kh-shadow);
|
| 100 |
+
backdrop-filter: blur(18px);
|
| 101 |
+
padding: 18px !important;
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
.kh-panel label,
|
| 105 |
+
.kh-panel .label-wrap span {
|
| 106 |
+
color: var(--kh-soft) !important;
|
| 107 |
+
font-weight: 700 !important;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
.kh-subhead {
|
| 111 |
+
margin: 8px 0 16px;
|
| 112 |
+
color: var(--kh-muted);
|
| 113 |
+
font-size: 0.95rem;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
.kh-chip-row {
|
| 117 |
+
display: flex;
|
| 118 |
+
flex-wrap: wrap;
|
| 119 |
+
gap: 10px;
|
| 120 |
+
margin-top: 18px;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
.kh-chip {
|
| 124 |
+
border: 1px solid var(--kh-line);
|
| 125 |
+
border-radius: 999px;
|
| 126 |
+
padding: 8px 12px;
|
| 127 |
+
color: var(--kh-soft);
|
| 128 |
+
background: rgba(255, 255, 255, 0.055);
|
| 129 |
+
font-size: 0.9rem;
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
.kh-stat {
|
| 133 |
+
min-height: 92px;
|
| 134 |
+
border: 1px solid var(--kh-line);
|
| 135 |
+
border-radius: 8px;
|
| 136 |
+
padding: 14px 16px;
|
| 137 |
+
background: rgba(255, 255, 255, 0.055);
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
.kh-stat .wrap,
|
| 141 |
+
.kh-stat input {
|
| 142 |
+
background: transparent !important;
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
.tabs {
|
| 146 |
+
margin-top: 20px;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
.tab-nav button {
|
| 150 |
+
color: var(--kh-muted) !important;
|
| 151 |
+
border-radius: 8px !important;
|
| 152 |
+
font-weight: 700 !important;
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
.tab-nav button.selected {
|
| 156 |
+
color: var(--kh-ink) !important;
|
| 157 |
+
background: linear-gradient(135deg, rgba(32, 214, 199, 0.22), rgba(184, 244, 93, 0.12)) !important;
|
| 158 |
+
border: 1px solid rgba(32, 214, 199, 0.34) !important;
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
textarea,
|
| 162 |
+
input {
|
| 163 |
+
color: var(--kh-ink) !important;
|
| 164 |
+
background: rgba(3, 7, 12, 0.52) !important;
|
| 165 |
+
border-color: rgba(255, 255, 255, 0.12) !important;
|
| 166 |
+
font-size: 0.96rem !important;
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
textarea::placeholder,
|
| 170 |
+
input::placeholder {
|
| 171 |
+
color: rgba(216, 225, 234, 0.46) !important;
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
#kh-status {
|
| 175 |
+
min-height: 130px;
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
#kh-status h3 {
|
| 179 |
+
color: var(--kh-lime);
|
| 180 |
+
margin-top: 0;
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
#kh-text-preview textarea {
|
| 184 |
+
min-height: 430px !important;
|
| 185 |
+
line-height: 1.6 !important;
|
| 186 |
+
font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, monospace !important;
|
| 187 |
+
font-size: 0.9rem !important;
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
#kh-search-results {
|
| 191 |
+
min-height: 410px;
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
#kh-answer,
|
| 195 |
+
#kh-reasoning {
|
| 196 |
+
min-height: 240px;
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
#kh-answer {
|
| 200 |
+
font-size: 1.02rem;
|
| 201 |
+
line-height: 1.7;
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
#kh-search-results h3 {
|
| 205 |
+
color: var(--kh-cyan);
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
#kh-reasoning {
|
| 209 |
+
color: var(--kh-muted);
|
| 210 |
+
font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, monospace;
|
| 211 |
+
font-size: 0.86rem;
|
| 212 |
+
line-height: 1.6;
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
.prose,
|
| 216 |
+
.markdown {
|
| 217 |
+
color: var(--kh-soft) !important;
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
.prose strong,
|
| 221 |
+
.markdown strong {
|
| 222 |
+
color: var(--kh-ink) !important;
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
button.primary {
|
| 226 |
+
min-height: 46px;
|
| 227 |
+
background: linear-gradient(135deg, var(--kh-cyan), var(--kh-lime)) !important;
|
| 228 |
+
color: #061015 !important;
|
| 229 |
+
border: 0 !important;
|
| 230 |
+
border-radius: 8px !important;
|
| 231 |
+
font-weight: 800 !important;
|
| 232 |
+
box-shadow: 0 16px 34px rgba(32, 214, 199, 0.2);
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
button.secondary {
|
| 236 |
+
border-radius: 8px !important;
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
.file-preview,
|
| 240 |
+
.upload-container {
|
| 241 |
+
border-color: rgba(32, 214, 199, 0.26) !important;
|
| 242 |
+
background: rgba(32, 214, 199, 0.055) !important;
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
@media (max-width: 760px) {
|
| 246 |
+
#kh-shell {
|
| 247 |
+
padding: 18px 10px 32px;
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
#kh-title h1 {
|
| 251 |
+
font-size: clamp(2.25rem, 15vw, 4.2rem);
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
.kh-panel {
|
| 255 |
+
padding: 14px !important;
|
| 256 |
+
}
|
| 257 |
+
}
|
| 258 |
+
"""
|
app/utils/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Utility helpers."""
|
app/utils/source_detection.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from urllib.parse import urlparse
|
| 4 |
+
|
| 5 |
+
from app.core.models import SourceType
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# New-style arXiv identifier (four digits, a dot, four or five digits), optionally
# preceded by an arxiv.org abs/pdf path and followed by a version suffix. The
# digit guards keep the pattern from matching inside a longer run of digits
# (for example "123456.789012" must not yield "3456.78901").
ARXIV_RE = re.compile(
    r"(?:arxiv\.org/(?:abs|pdf)/)?(?<!\d)(?P<id>\d{4}\.\d{4,5})(?!\d)(?:v\d+)?",
    re.I,
)
YOUTUBE_HOSTS = {"youtube.com", "www.youtube.com", "m.youtube.com", "youtu.be", "www.youtu.be"}


def _hostname(value: str) -> str:
    """Return the lower-cased host of *value*, or ``""`` when none can be parsed.

    ``urlparse`` only recognises a network location after ``//``, so input
    without a scheme is parsed as a scheme-relative reference. Using
    ``hostname`` rather than ``netloc`` drops any port or credentials.
    """
    candidate = value if "://" in value else f"//{value}"
    try:
        return urlparse(candidate).hostname or ""
    except ValueError:
        # Malformed authority (e.g. an unbalanced "["): treat as "no host".
        return ""


def detect_source(url: str | None, pdf_path: str | None) -> SourceType:
    """Classify the user's input as a PDF upload, a YouTube link, or an arXiv reference.

    An uploaded file takes precedence over ``url``.

    Args:
        url: YouTube URL, arXiv URL, or bare arXiv ID; may be ``None`` or blank.
        pdf_path: Path of an uploaded file, or ``None``.

    Returns:
        The detected ``SourceType``.

    Raises:
        ValueError: If the upload is not a ``.pdf`` file, if neither input is
            provided, or if the text matches no known source.
    """
    if pdf_path:
        if Path(pdf_path).suffix.lower() == ".pdf":
            return SourceType.PDF
        raise ValueError("Uploaded file must be a PDF.")

    if not url or not url.strip():
        raise ValueError("Provide a YouTube link, arXiv link/ID, or upload a PDF.")

    clean_url = url.strip()
    # NOTE(review): the original string is what callers pass on to the
    # extractors; confirm they accept scheme-less URLs as well.
    host = _hostname(clean_url)

    if host in YOUTUBE_HOSTS:
        return SourceType.YOUTUBE
    # Exact domain or subdomain match, so look-alike hosts such as
    # "notarxiv.org" are not accepted on the host alone.
    is_arxiv_host = host == "arxiv.org" or host.endswith(".arxiv.org")
    if is_arxiv_host or ARXIV_RE.search(clean_url):
        return SourceType.ARXIV
    raise ValueError("Could not detect source type. Use a YouTube URL, arXiv URL/ID, or PDF.")


def extract_arxiv_id(value: str) -> str:
    """Return the arXiv identifier found in *value*, without any version suffix.

    Args:
        value: An arXiv URL or bare identifier; surrounding whitespace is ignored.

    Raises:
        ValueError: If no identifier is present.
    """
    match = ARXIV_RE.search(value.strip())
    if not match:
        raise ValueError("Could not find a valid arXiv ID.")
    return match.group("id")
|
app/utils/zerogpu.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from collections.abc import Callable
|
| 3 |
+
from typing import TypeVar
|
| 4 |
+
|
| 5 |
+
from app.core.config import settings
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
F = TypeVar("F", bound=Callable)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def gpu(duration: int | None = None) -> Callable[[F], F]:
    """Return a decorator that requests a ZeroGPU slot, or a no-op decorator.

    The no-op is returned when ZeroGPU is not enabled for this process or when
    the ``spaces`` package cannot be imported.

    Args:
        duration: Value passed to ``spaces.GPU`` as ``duration``; a falsy value
            selects ``settings.ZEROGPU_DURATION_SECONDS``.
    """
    if _should_use_zerogpu():
        try:
            import spaces
        except ImportError:
            # Package unavailable: fall through to the no-op decorator below.
            pass
        else:
            seconds = duration or settings.ZEROGPU_DURATION_SECONDS
            return spaces.GPU(duration=seconds)
    return _identity
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _identity(func: F) -> F:
|
| 24 |
+
return func
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _should_use_zerogpu() -> bool:
|
| 28 |
+
if os.getenv("DISABLE_ZEROGPU", "").lower() in {"1", "true", "yes"}:
|
| 29 |
+
return False
|
| 30 |
+
if os.getenv("ENABLE_ZEROGPU", "").lower() in {"1", "true", "yes"}:
|
| 31 |
+
return True
|
| 32 |
+
return bool(os.getenv("SPACE_ID"))
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def is_enabled() -> bool:
    """Public accessor: report whether ZeroGPU would be used in this process."""
    enabled = _should_use_zerogpu()
    return enabled
|
pyproject.toml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "knowledgehub-ingestor"
|
| 3 |
+
version = "1.0.0"
|
| 4 |
+
description = "A Gradio document ingestion UI for PDFs, arXiv papers, and YouTube transcripts."
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.10"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"arxiv>=2.1.3",
|
| 9 |
+
"datasets>=5.0.0",
|
| 10 |
+
"gradio>=5.0.0",
|
| 11 |
+
"openai>=1.99.0",
|
| 12 |
+
"pydantic-settings>=2.4.0",
|
| 13 |
+
"pypdf>=4.3.1",
|
| 14 |
+
"python-dotenv>=1.0.1",
|
| 15 |
+
"qdrant-client>=1.12.1",
|
| 16 |
+
"requests>=2.32.3",
|
| 17 |
+
"sentence-transformers>=3.0.1",
|
| 18 |
+
"spaces",
|
| 19 |
+
"torchvision>=0.27.0",
|
| 20 |
+
"youtube-transcript-api>=0.6.2",
|
| 21 |
+
]
|
| 22 |
+
|
| 23 |
+
[project.optional-dependencies]
|
| 24 |
+
dev = [
|
| 25 |
+
"ruff>=0.6.0",
|
| 26 |
+
"pytest>=8.3.2",
|
| 27 |
+
]
|
| 28 |
+
|
| 29 |
+
[tool.ruff]
|
| 30 |
+
line-length = 100
|
| 31 |
+
target-version = "py310"
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
arxiv>=2.1.3
|
| 2 |
+
gradio>=5.0.0
|
| 3 |
+
openai>=1.99.0
|
| 4 |
+
pydantic-settings>=2.4.0
|
| 5 |
+
pypdf>=4.3.1
|
| 6 |
+
python-dotenv>=1.0.1
|
| 7 |
+
qdrant-client>=1.12.1
|
| 8 |
+
requests>=2.32.3
|
| 9 |
+
sentence-transformers>=3.0.1
|
| 10 |
+
spaces
|
| 11 |
+
youtube-transcript-api>=0.6.2
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|