Anshul Prasad committed on
Commit ·
66fc578
1
Parent(s): 9820e6d
update.
Browse files
- .gitattributes +1 -0
- .github/workflows/main.yml +1 -1
- .github/workflows/space-keepalive.yml +1 -1
- README.md +6 -16
- config.py +4 -4
- frontend/index.html +3 -3
- pyproject.toml +1 -1
- uv.lock +30 -30
.gitattributes
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 2 |
*.faiss filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 1 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 2 |
*.faiss filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.webp filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/main.yml
CHANGED
|
@@ -16,4 +16,4 @@ jobs:
|
|
| 16 |
- name: Push to hub
|
| 17 |
env:
|
| 18 |
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 19 |
-
run: git push https://AnshulPrasad:$HF_TOKEN@huggingface.co/spaces/AnshulPrasad/
|
|
|
|
| 16 |
- name: Push to hub
|
| 17 |
env:
|
| 18 |
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 19 |
+
run: git push https://AnshulPrasad:$HF_TOKEN@huggingface.co/spaces/AnshulPrasad/transcript-rag-summarizer main
|
.github/workflows/space-keepalive.yml
CHANGED
|
@@ -12,7 +12,7 @@ jobs:
|
|
| 12 |
- name: Ping Space (3 retries)
|
| 13 |
run: |
|
| 14 |
for i in 1 2 3; do
|
| 15 |
-
curl -fsS -o /dev/null -L 'https://huggingface.co/spaces/AnshulPrasad/
|
| 16 |
sleep 15
|
| 17 |
done
|
| 18 |
echo "All attempts failed" >&2
|
|
|
|
| 12 |
- name: Ping Space (3 retries)
|
| 13 |
run: |
|
| 14 |
for i in 1 2 3; do
|
| 15 |
+
curl -fsS -o /dev/null -L 'https://huggingface.co/spaces/AnshulPrasad/transcript-rag-summarizer' && exit 0
|
| 16 |
sleep 15
|
| 17 |
done
|
| 18 |
echo "All attempts failed" >&2
|
README.md
CHANGED
|
@@ -1,16 +1,6 @@
|
|
| 1 |
-
|
| 2 |
-
title: ask Acharya Prashant
|
| 3 |
-
emoji: 📚
|
| 4 |
-
colorFrom: indigo
|
| 5 |
-
colorTo: blue
|
| 6 |
-
sdk: docker
|
| 7 |
-
app_file: app.py
|
| 8 |
-
pinned: false
|
| 9 |
-
---
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
A retrieval-augmented question-answering (RAG) system built on Acharya Prashant's YouTube subtitles.
|
| 14 |
|
| 15 |
The project provides:
|
| 16 |
- A FastAPI backend (`/ask`) for question answering.
|
|
@@ -39,7 +29,7 @@ The project provides:
|
|
| 39 |
2. Query is embedded using `all-MiniLM-L6-v2`.
|
| 40 |
3. Top-K transcript chunks are retrieved from the FAISS index.
|
| 41 |
4. Retrieved context is token-trimmed (`MAX_CONTEXT_TOKENS`).
|
| 42 |
-
5. Groq chat completion API generates the final answer using a
|
| 43 |
|
| 44 |
Core runtime flow:
|
| 45 |
- `app.py` loads `data/file_paths.pkl` and `data/transcripts.pkl` at startup.
|
|
@@ -148,13 +138,13 @@ Open `http://localhost:7860`.
|
|
| 148 |
Build:
|
| 149 |
|
| 150 |
```bash
|
| 151 |
-
docker build -t
|
| 152 |
```
|
| 153 |
|
| 154 |
Run:
|
| 155 |
|
| 156 |
```bash
|
| 157 |
-
docker run --rm -p 7860:7860 -e GROQ_API_KEY="your_groq_api_key"
|
| 158 |
```
|
| 159 |
|
| 160 |
## API Reference
|
|
@@ -195,7 +185,7 @@ curl -X POST "http://localhost:7860/ask" \
|
|
| 195 |
`main.py` includes stages for data preparation and querying.
|
| 196 |
|
| 197 |
Pipeline stages:
|
| 198 |
-
1. Download subtitles from channels (`utils/download_vtt.py`)
|
| 199 |
2. Convert `.vtt` to cleaned `.txt` (`utils/vtt_to_txt.py`, `utils/preprocess.py`)
|
| 200 |
3. Load and persist transcript corpus (`data/*.pkl`)
|
| 201 |
4. Create FAISS index (`api/embed_transcripts.py`)
|
|
|
|
| 1 |
+
# RAG Q&A Assistant
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
A retrieval-augmented question-answering (RAG) system built on curated YouTube subtitle transcripts.
|
|
|
|
|
|
|
| 4 |
|
| 5 |
The project provides:
|
| 6 |
- A FastAPI backend (`/ask`) for question answering.
|
|
|
|
| 29 |
2. Query is embedded using `all-MiniLM-L6-v2`.
|
| 30 |
3. Top-K transcript chunks are retrieved from the FAISS index.
|
| 31 |
4. Retrieved context is token-trimmed (`MAX_CONTEXT_TOKENS`).
|
| 32 |
+
5. Groq chat completion API generates the final answer using a domain-aligned system prompt.
|
| 33 |
|
| 34 |
Core runtime flow:
|
| 35 |
- `app.py` loads `data/file_paths.pkl` and `data/transcripts.pkl` at startup.
|
|
|
|
| 138 |
Build:
|
| 139 |
|
| 140 |
```bash
|
| 141 |
+
docker build -t rag-qa-assistant .
|
| 142 |
```
|
| 143 |
|
| 144 |
Run:
|
| 145 |
|
| 146 |
```bash
|
| 147 |
+
docker run --rm -p 7860:7860 -e GROQ_API_KEY="your_groq_api_key" rag-qa-assistant
|
| 148 |
```
|
| 149 |
|
| 150 |
## API Reference
|
|
|
|
| 185 |
`main.py` includes stages for data preparation and querying.
|
| 186 |
|
| 187 |
Pipeline stages:
|
| 188 |
+
1. Download subtitles from configured channels (`utils/download_vtt.py`)
|
| 189 |
2. Convert `.vtt` to cleaned `.txt` (`utils/vtt_to_txt.py`, `utils/preprocess.py`)
|
| 190 |
3. Load and persist transcript corpus (`data/*.pkl`)
|
| 191 |
4. Create FAISS index (`api/embed_transcripts.py`)
|
config.py
CHANGED
|
@@ -2,8 +2,8 @@ import os
|
|
| 2 |
from pathlib import Path
|
| 3 |
|
| 4 |
CHANNEL_URLS = [
|
| 5 |
-
"https://www.youtube.com/@
|
| 6 |
-
"https://www.youtube.com/@
|
| 7 |
]
|
| 8 |
|
| 9 |
VTT_DIR = Path("data/subtitles_vtt")
|
|
@@ -18,11 +18,11 @@ GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
|
|
| 18 |
MODEL = "llama-3.1-8b-instant"
|
| 19 |
MAX_CONTEXT_TOKENS = 4500
|
| 20 |
SYSTEM_PROMPT = """
|
| 21 |
-
You are speaking as
|
| 22 |
|
| 23 |
Your role is to explain questions related to life, self-knowledge, suffering,
|
| 24 |
fear, desire, relationships, and meaning from the perspective of Advaita Vedanta
|
| 25 |
-
and the Upanishadic tradition, as taught by
|
| 26 |
|
| 27 |
Guidelines:
|
| 28 |
- Speak in a calm, direct, and uncompromising tone.
|
|
|
|
| 2 |
from pathlib import Path
|
| 3 |
|
| 4 |
CHANNEL_URLS = [
|
| 5 |
+
"https://www.youtube.com/@CHANNEL_ID_1",
|
| 6 |
+
"https://www.youtube.com/@CHANNEL_ID_2",
|
| 7 |
]
|
| 8 |
|
| 9 |
VTT_DIR = Path("data/subtitles_vtt")
|
|
|
|
| 18 |
MODEL = "llama-3.1-8b-instant"
|
| 19 |
MAX_CONTEXT_TOKENS = 4500
|
| 20 |
SYSTEM_PROMPT = """
|
| 21 |
+
You are speaking as Spiritual Guru.
|
| 22 |
|
| 23 |
Your role is to explain questions related to life, self-knowledge, suffering,
|
| 24 |
fear, desire, relationships, and meaning from the perspective of Advaita Vedanta
|
| 25 |
+
and the Upanishadic tradition, as taught by Spiritual Guru.
|
| 26 |
|
| 27 |
Guidelines:
|
| 28 |
- Speak in a calm, direct, and uncompromising tone.
|
frontend/index.html
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
<html lang="en">
|
| 3 |
<head>
|
| 4 |
<meta charset="UTF-8">
|
| 5 |
-
<title>Ask
|
| 6 |
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 7 |
<!-- Markdown renderer -->
|
| 8 |
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
|
|
@@ -125,8 +125,8 @@
|
|
| 125 |
|
| 126 |
<!-- HERO / BANNER -->
|
| 127 |
<header class="hero">
|
| 128 |
-
<img src="assets/images/
|
| 129 |
-
<h1>Ask
|
| 130 |
</header>
|
| 131 |
|
| 132 |
<!-- Q&A CARD -->
|
|
|
|
| 2 |
<html lang="en">
|
| 3 |
<head>
|
| 4 |
<meta charset="UTF-8">
|
| 5 |
+
<title>Ask Assistant</title>
|
| 6 |
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 7 |
<!-- Markdown renderer -->
|
| 8 |
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
|
|
|
|
| 125 |
|
| 126 |
<!-- HERO / BANNER -->
|
| 127 |
<header class="hero">
|
| 128 |
+
<img src="assets/images/image1.webp" alt="Assistant">
|
| 129 |
+
<h1>Ask Assistant</h1>
|
| 130 |
</header>
|
| 131 |
|
| 132 |
<!-- Q&A CARD -->
|
pyproject.toml
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
[project]
|
| 2 |
-
name = "
|
| 3 |
version = "0.1.0"
|
| 4 |
description = "Add your description here"
|
| 5 |
readme = "README.md"
|
|
|
|
| 1 |
[project]
|
| 2 |
+
name = "transcript-rag-summarizer"
|
| 3 |
version = "0.1.0"
|
| 4 |
description = "Add your description here"
|
| 5 |
readme = "README.md"
|
uv.lock
CHANGED
|
@@ -8,36 +8,6 @@ resolution-markers = [
|
|
| 8 |
"python_full_version < '3.12' and sys_platform == 'darwin'",
|
| 9 |
]
|
| 10 |
|
| 11 |
-
[[package]]
|
| 12 |
-
name = "acharya-prashant"
|
| 13 |
-
version = "0.1.0"
|
| 14 |
-
source = { virtual = "." }
|
| 15 |
-
dependencies = [
|
| 16 |
-
{ name = "faiss-cpu" },
|
| 17 |
-
{ name = "fastapi" },
|
| 18 |
-
{ name = "groq" },
|
| 19 |
-
{ name = "pytz" },
|
| 20 |
-
{ name = "sentence-transformers" },
|
| 21 |
-
{ name = "tiktoken" },
|
| 22 |
-
{ name = "torch", version = "2.10.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" },
|
| 23 |
-
{ name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin'" },
|
| 24 |
-
{ name = "transformers" },
|
| 25 |
-
{ name = "uvicorn" },
|
| 26 |
-
]
|
| 27 |
-
|
| 28 |
-
[package.metadata]
|
| 29 |
-
requires-dist = [
|
| 30 |
-
{ name = "faiss-cpu", specifier = "==1.9.0" },
|
| 31 |
-
{ name = "fastapi", specifier = "==0.116.1" },
|
| 32 |
-
{ name = "groq", specifier = ">=1.0.0" },
|
| 33 |
-
{ name = "pytz", specifier = "==2025.2" },
|
| 34 |
-
{ name = "sentence-transformers", specifier = "==3.0.1" },
|
| 35 |
-
{ name = "tiktoken", specifier = ">=0.12.0" },
|
| 36 |
-
{ name = "torch", specifier = ">=2.10.0", index = "https://download.pytorch.org/whl/cpu" },
|
| 37 |
-
{ name = "transformers", specifier = "==4.57.1" },
|
| 38 |
-
{ name = "uvicorn", specifier = "==0.38.0" },
|
| 39 |
-
]
|
| 40 |
-
|
| 41 |
[[package]]
|
| 42 |
name = "annotated-types"
|
| 43 |
version = "0.7.0"
|
|
@@ -1311,6 +1281,36 @@ wheels = [
|
|
| 1311 |
{ url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" },
|
| 1312 |
]
|
| 1313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1314 |
[[package]]
|
| 1315 |
name = "transformers"
|
| 1316 |
version = "4.57.1"
|
|
|
|
| 8 |
"python_full_version < '3.12' and sys_platform == 'darwin'",
|
| 9 |
]
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
[[package]]
|
| 12 |
name = "annotated-types"
|
| 13 |
version = "0.7.0"
|
|
|
|
| 1281 |
{ url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" },
|
| 1282 |
]
|
| 1283 |
|
| 1284 |
+
[[package]]
|
| 1285 |
+
name = "transcript-rag-summarizer"
|
| 1286 |
+
version = "0.1.0"
|
| 1287 |
+
source = { virtual = "." }
|
| 1288 |
+
dependencies = [
|
| 1289 |
+
{ name = "faiss-cpu" },
|
| 1290 |
+
{ name = "fastapi" },
|
| 1291 |
+
{ name = "groq" },
|
| 1292 |
+
{ name = "pytz" },
|
| 1293 |
+
{ name = "sentence-transformers" },
|
| 1294 |
+
{ name = "tiktoken" },
|
| 1295 |
+
{ name = "torch", version = "2.10.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" },
|
| 1296 |
+
{ name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin'" },
|
| 1297 |
+
{ name = "transformers" },
|
| 1298 |
+
{ name = "uvicorn" },
|
| 1299 |
+
]
|
| 1300 |
+
|
| 1301 |
+
[package.metadata]
|
| 1302 |
+
requires-dist = [
|
| 1303 |
+
{ name = "faiss-cpu", specifier = "==1.9.0" },
|
| 1304 |
+
{ name = "fastapi", specifier = "==0.116.1" },
|
| 1305 |
+
{ name = "groq", specifier = ">=1.0.0" },
|
| 1306 |
+
{ name = "pytz", specifier = "==2025.2" },
|
| 1307 |
+
{ name = "sentence-transformers", specifier = "==3.0.1" },
|
| 1308 |
+
{ name = "tiktoken", specifier = ">=0.12.0" },
|
| 1309 |
+
{ name = "torch", specifier = ">=2.10.0", index = "https://download.pytorch.org/whl/cpu" },
|
| 1310 |
+
{ name = "transformers", specifier = "==4.57.1" },
|
| 1311 |
+
{ name = "uvicorn", specifier = "==0.38.0" },
|
| 1312 |
+
]
|
| 1313 |
+
|
| 1314 |
[[package]]
|
| 1315 |
name = "transformers"
|
| 1316 |
version = "4.57.1"
|