Commit ·
1c23b7c
1
Parent(s): 97acca0
update cursor
Browse files- .gitignore +13 -0
- Dockerfile +22 -0
- Feedback.csv +0 -0
- README.md +221 -0
- app/__init__.py +2 -0
- app/api.py +177 -0
- app/config.py +27 -0
- app/data_loader.py +19 -0
- app/embedding.py +28 -0
- app/preprocess.py +22 -0
- app/rag_service.py +135 -0
- app/sentiment.py +24 -0
- app/topics.py +22 -0
- app/vector_store.py +61 -0
- requirements.txt +17 -0
- run.py +12 -0
- scripts/__init__.py +2 -0
- scripts/precompute_index.py +21 -0
.gitignore
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv/
|
| 2 |
+
__pycache__/
|
| 3 |
+
.pytest_cache/
|
| 4 |
+
.mypy_cache/
|
| 5 |
+
.vector_index/
|
| 6 |
+
.env
|
| 7 |
+
*.parquet
|
| 8 |
+
*.index
|
| 9 |
+
*.ipynb_checkpoints/
|
| 10 |
+
dist/
|
| 11 |
+
build/
|
| 12 |
+
*.egg-info/
|
| 13 |
+
|
Dockerfile
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim

# Lean runtime: no .pyc files, unbuffered logs, no pip cache, no HF telemetry.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    HF_HUB_DISABLE_TELEMETRY=1

WORKDIR /app

# Copy only the dependency manifest first so the pip layer is cached when
# source files change.
COPY requirements.txt ./
# Install Torch CPU wheels first to avoid heavy builds
RUN pip install --upgrade pip && \
    pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu \
    torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 && \
    pip install --no-cache-dir -r requirements.txt --default-timeout=100

COPY . .

EXPOSE 8000

CMD ["python", "run.py"]
+
|
Feedback.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
README.md
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Feedback Analysis RAG Agent
|
| 2 |
+
|
| 3 |
+
An end-to-end system for analyzing citizen feedback with Retrieval-Augmented Generation (RAG). It ingests `Feedback.csv`, creates multilingual embeddings, builds a FAISS vector index, and exposes a FastAPI API for semantic search, topic clustering, and sentiment summaries. Designed to run locally or in containers, and to be deployable to Runpod.
|
| 4 |
+
|
| 5 |
+
### Features
|
| 6 |
+
- Multilingual ingestion (Hebrew supported) from `Feedback.csv`
|
| 7 |
+
- Preprocessing: optional normalization, language detection
|
| 8 |
+
- Embeddings: Sentence-Transformers (multilingual) + FAISS
|
| 9 |
+
- Retrieval: top-k semantic nearest neighbors with filters
|
| 10 |
+
- Summarization: LLM (OpenAI) if configured; fallback to extractive summary
|
| 11 |
+
- Supports Gemini (preferred) or OpenAI when API keys are provided
|
| 12 |
+
- Topics: k-means topic clustering over embeddings
|
| 13 |
+
- Sentiment: multilingual transformer pipeline
|
| 14 |
+
- FastAPI endpoints and a simple CLI
|
| 15 |
+
|
| 16 |
+
### Project layout
|
| 17 |
+
```
|
| 18 |
+
app/
|
| 19 |
+
api.py
|
| 20 |
+
config.py
|
| 21 |
+
data_loader.py
|
| 22 |
+
embedding.py
|
| 23 |
+
preprocess.py
|
| 24 |
+
rag_service.py
|
| 25 |
+
sentiment.py
|
| 26 |
+
topics.py
|
| 27 |
+
vector_store.py
|
| 28 |
+
run.py
|
| 29 |
+
requirements.txt
|
| 30 |
+
Dockerfile
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
### Quick start
|
| 34 |
+
1) Python 3.10+
|
| 35 |
+
2) Install:
|
| 36 |
+
```
|
| 37 |
+
python -m venv .venv && source .venv/bin/activate
|
| 38 |
+
pip install -r requirements.txt
|
| 39 |
+
```
|
| 40 |
+
3) Environment (optional):
|
| 41 |
+
```
|
| 42 |
+
export OPENAI_API_KEY=sk-...
|
| 43 |
+
export GEMINI_API_KEY=your_gemini_key
|
| 44 |
+
```
|
| 45 |
+
4) Run API:
|
| 46 |
+
```
|
| 47 |
+
python run.py
|
| 48 |
+
```
|
| 49 |
+
Open http://127.0.0.1:8000/docs
|
| 50 |
+
|
| 51 |
+
5) CLI example:
|
| 52 |
+
```
|
| 53 |
+
python -m app.rag_service --query "שיפור טופס" --top_k 5
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
### Configuration
|
| 57 |
+
Environment variables:
|
| 58 |
+
- GEMINI_API_KEY: If set, RAG uses Gemini (preferred) for summaries
|
| 59 |
+
- OPENAI_API_KEY: If set, RAG can use OpenAI as a fallback
|
| 60 |
+
- EMBEDDING_MODEL: Sentence-Transformers model name (default: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)
|
| 61 |
+
- VECTOR_INDEX_PATH: Path to persist FAISS index (default: ./.vector_index/faiss.index)
|
| 62 |
+
|
| 63 |
+
### Notes
|
| 64 |
+
- The first run will download models (embeddings, sentiment); ensure internet access.
|
| 65 |
+
- The system reads from `Feedback.csv` in the repo root. Update `app/data_loader.py` if your schema differs.
|
| 66 |
+
|
| 67 |
+
### Runpod
|
| 68 |
+
- This repo includes a `Dockerfile`. Build and push the image; configure your Runpod template to run `python run.py` and expose port 8000.
|
| 69 |
+
|
| 70 |
+
### Secrets hygiene
|
| 71 |
+
- Do not commit real secrets. Use environment variables or a local `.env` file.
|
| 72 |
+
- `.env` is gitignored by default via `.gitignore`.
|
| 73 |
+
- Rotate any keys that were ever shared publicly.
|
| 74 |
+
|
| 75 |
+
## Run on Runpod - Full guide
|
| 76 |
+
|
| 77 |
+
### 1) Build and push the container
|
| 78 |
+
- From project root:
|
| 79 |
+
```
|
| 80 |
+
docker build -t YOUR_DOCKERHUB_USER/feedback-rag:latest .
|
| 81 |
+
docker login
|
| 82 |
+
docker push YOUR_DOCKERHUB_USER/feedback-rag:latest
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
### 2) Prepare environment variables (no secrets in git)
|
| 86 |
+
- You will set secrets within Runpod, not in code:
|
| 87 |
+
- Required:
|
| 88 |
+
- `GEMINI_API_KEY` = your Gemini key
|
| 89 |
+
- Optional:
|
| 90 |
+
- `OPENAI_API_KEY` = OpenAI fallback key
|
| 91 |
+
- `CSV_PATH` = path to your CSV if not the default `Feedback.csv`
|
| 92 |
+
- `VECTOR_INDEX_PATH` and `VECTOR_METADATA_PATH` if you change mount/paths
|
| 93 |
+
|
| 94 |
+
### 3) Create a Runpod Template (Serverless HTTP recommended)
|
| 95 |
+
- In Runpod Console → Templates → Create Template
|
| 96 |
+
- Fields:
|
| 97 |
+
- Container Image: `YOUR_DOCKERHUB_USER/feedback-rag:latest` (if you want you can use mine: `galbendavids/feedback-rag:latest`)
|
| 98 |
+
- Container Port: `8000`
|
| 99 |
+
- Command: `python run.py`
|
| 100 |
+
- Environment Variables:
|
| 101 |
+
- `GEMINI_API_KEY=your_key`
|
| 102 |
+
- (optional) `OPENAI_API_KEY=sk-...`
|
| 103 |
+
- (optional) `CSV_PATH=/workspace/Feedback.csv`
|
| 104 |
+
- (optional) `VECTOR_INDEX_PATH=/workspace/.vector_index/faiss.index`
|
| 105 |
+
- (optional) `VECTOR_METADATA_PATH=/workspace/.vector_index/meta.parquet`
|
| 106 |
+
- Volumes (recommended to persist the FAISS index):
|
| 107 |
+
- Create a volume, mount it at `/workspace/.vector_index`
|
| 108 |
+
- Make sure your `VECTOR_*` env vars point to that mount path if changed
|
| 109 |
+
|
| 110 |
+
### 4) Deploy a Serverless Endpoint
|
| 111 |
+
- Create Endpoint from the template (Serverless)
|
| 112 |
+
- Choose region and CPU (CPU is sufficient)
|
| 113 |
+
- Wait until status is Running and an endpoint URL is provided
|
| 114 |
+
|
| 115 |
+
### 5) Upload or point to your CSV
|
| 116 |
+
- Option A (bundled): Keep `Feedback.csv` in the image (already in repo root)
|
| 117 |
+
- Option B (mounted): Upload to a mounted volume and set `CSV_PATH` accordingly
|
| 118 |
+
|
| 119 |
+
### 6) First-time ingestion (build the vector index)
|
| 120 |
+
- Trigger ingestion once to build and persist the FAISS index:
|
| 121 |
+
```
|
| 122 |
+
curl -X POST {YOUR_ENDPOINT_URL}/ingest
|
| 123 |
+
```
|
| 124 |
+
- On first run, models download and embeddings are computed; allow a few minutes
|
| 125 |
+
- The index will be stored under `.vector_index` (persist if using a volume)
|
| 126 |
+
|
| 127 |
+
### 7) Test the API
|
| 128 |
+
- Health:
|
| 129 |
+
```
|
| 130 |
+
curl -s {YOUR_ENDPOINT_URL}/health
|
| 131 |
+
```
|
| 132 |
+
- Query:
|
| 133 |
+
```
|
| 134 |
+
curl -X POST {YOUR_ENDPOINT_URL}/query \
  -H "Content-Type: application/json" \
  -d '{"query":"שיפור טופס", "top_k": 5}'
|
| 138 |
+
```
|
| 139 |
+
- Topics:
|
| 140 |
+
```
|
| 141 |
+
curl -s "{YOUR_ENDPOINT_URL}/topics?num_topics=8"
|
| 142 |
+
```
|
| 143 |
+
- Sentiment (first N rows):
|
| 144 |
+
```
|
| 145 |
+
curl -s "{YOUR_ENDPOINT_URL}/sentiment?limit=100"
|
| 146 |
+
```
|
| 147 |
+
- Interactive docs (Swagger UI):
|
| 148 |
+
- Open `{YOUR_ENDPOINT_URL}/docs` in your browser
|
| 149 |
+
|
| 150 |
+
### 8) Using Dedicated Pods (alternative)
|
| 151 |
+
- Launch a Dedicated Pod from the template
|
| 152 |
+
- Ensure command `python run.py` and port `8000`
|
| 153 |
+
- Use the Pod’s public endpoint to access `/health`, `/ingest`, `/query`, etc.
|
| 154 |
+
|
| 155 |
+
### 9) Troubleshooting
|
| 156 |
+
- 404/connection:
|
| 157 |
+
- Endpoint not Running yet or wrong port; port must be `8000`
|
| 158 |
+
- Slow initial response:
|
| 159 |
+
- First-time model downloads are expected; subsequent calls are faster
|
| 160 |
+
- No/few results:
|
| 161 |
+
- Ensure you POSTed `/ingest` first and that your CSV has the `Text` column
|
| 162 |
+
- Index not persisted:
|
| 163 |
+
- Mount a volume at `/workspace/.vector_index` and set `VECTOR_*` paths
|
| 164 |
+
|
| 165 |
+
### 10) Optional: Pre-cache models to speed cold starts
|
| 166 |
+
- You can pre-bake model weights in the image by adding to your `Dockerfile`:
|
| 167 |
+
```
|
| 168 |
+
# Optional: pre-download models during build to reduce cold start time
|
| 169 |
+
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')"
|
| 170 |
+
RUN python -c "from transformers import pipeline; pipeline('sentiment-analysis', model='cardiffnlp/twitter-xlm-roberta-base-sentiment')"
|
| 171 |
+
```
|
| 172 |
+
- Rebuild and push the image after adding these lines.
|
| 173 |
+
|
| 174 |
+
## Offline precompute (embed the DB locally for fast startup)
|
| 175 |
+
|
| 176 |
+
If you want the API to start fast on Runpod without running `/ingest` there, precompute the vector index locally:
|
| 177 |
+
|
| 178 |
+
1) Create venv and install deps:
|
| 179 |
+
```
|
| 180 |
+
python -m venv .venv && source .venv/bin/activate
|
| 181 |
+
pip install -r requirements.txt
|
| 182 |
+
```
|
| 183 |
+
2) Ensure `Feedback.csv` exists at repo root (or set `CSV_PATH`).
|
| 184 |
+
3) Run the offline precompute script:
|
| 185 |
+
```
|
| 186 |
+
python scripts/precompute_index.py
|
| 187 |
+
```
|
| 188 |
+
This writes:
|
| 189 |
+
- `.vector_index/faiss.index`
|
| 190 |
+
- `.vector_index/meta.parquet`
|
| 191 |
+
|
| 192 |
+
4) Option A: Commit the index (makes startup fastest)
|
| 193 |
+
- By default `.vector_index/` is in `.gitignore`. To commit it, you can temporarily remove that entry and run:
|
| 194 |
+
```
|
| 195 |
+
git add .vector_index/faiss.index .vector_index/meta.parquet
|
| 196 |
+
git commit -m "Add precomputed FAISS index"
|
| 197 |
+
git push
|
| 198 |
+
```
|
| 199 |
+
(Note: repo size will increase; acceptable for small indices.)
|
| 200 |
+
|
| 201 |
+
5) Option B: Keep index uncommitted; mount it on Runpod
|
| 202 |
+
- Upload the `.vector_index/` folder to a Runpod volume mounted at `/workspace/.vector_index`
|
| 203 |
+
- Set env vars if you changed paths:
|
| 204 |
+
- `VECTOR_INDEX_PATH=/workspace/.vector_index/faiss.index`
|
| 205 |
+
- `VECTOR_METADATA_PATH=/workspace/.vector_index/meta.parquet`
|
| 206 |
+
|
| 207 |
+
With either option, the API will be immediately queryable without calling `/ingest`.
|
| 208 |
+
|
| 209 |
+
### When your data changes
|
| 210 |
+
- If you update `Feedback.csv` (or change `CSV_PATH` to a new dataset), you must rerun:
|
| 211 |
+
```
|
| 212 |
+
python -m scripts.precompute_index
|
| 213 |
+
```
|
| 214 |
+
- Then redeploy (bake new files into the image or upload to your Runpod volume) so the server uses the fresh index.
|
| 215 |
+
|
| 216 |
+
### Adding new feedback entries
|
| 217 |
+
- You can add rows to `Feedback.csv` and either:
|
| 218 |
+
- Rebuild the entire index (simple, safest):
|
| 219 |
+
- `python -m scripts.precompute_index`
|
| 220 |
+
- Or implement an incremental append (advanced): embed only the new rows with `EmbeddingModel.encode(...)`, call `FaissVectorStore.load(...)`, then `store.add(new_vectors, new_metadata)` and `store.save(...)`. This keeps the same architecture and avoids re-embedding all previous data.
|
| 221 |
+
|
app/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Makes `app` a package so imports like `from app.rag_service import RAGService` work.
|
| 2 |
+
|
app/api.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations

from typing import List, Optional, Dict, Any

import numpy as np
import pandas as pd
from fastapi import FastAPI, Query
from pydantic import BaseModel

from .config import settings
from .data_loader import load_feedback
from .embedding import EmbeddingModel
from .rag_service import RAGService
from .sentiment import analyze_sentiments
from .topics import kmeans_topics
from .vector_store import FaissVectorStore


# NOTE: the previous `default_response_class=None` argument breaks every
# endpoint at request time — FastAPI instantiates the response class for each
# response, and None is not callable. Omitting it restores the framework
# default (JSONResponse).
app = FastAPI(title="Feedback Analysis RAG Agent", version="1.0.0")

# Created at import time so the embedding model is loaded once and shared
# between the RAG service and the /topics endpoint.
svc = RAGService()
embedder = svc.embedder
| 24 |
+
class QueryRequest(BaseModel):
    """Body of POST /query: a free-form question and the number of hits wanted."""

    query: str
    top_k: int = 5
|
| 28 |
+
|
| 29 |
+
class QueryResponse(BaseModel):
    """Response of POST /query: the echoed query, an optional summary, and hits."""

    query: str
    summary: Optional[str]
    results: List[Dict[str, Any]]
+
|
| 34 |
+
|
| 35 |
+
@app.get("/health")
def health() -> Dict[str, str]:
    """Liveness probe used by deployment checks; always reports OK."""
    return {"status": "ok"}
+
|
| 39 |
+
|
| 40 |
+
@app.post("/ingest")
def ingest() -> Dict[str, Any]:
    """Build the vector index from Feedback.csv"""
    try:
        svc.ingest()
    except FileNotFoundError as e:
        # The source CSV is missing — point at the file rather than a 500.
        return {"status": "error", "message": f"CSV file not found: {str(e)}"}
    except Exception as e:
        return {"status": "error", "message": f"Ingestion failed: {str(e)}"}
    return {"status": "ingested", "message": "Vector index built successfully"}
+
|
| 51 |
+
|
| 52 |
+
@app.post("/query", response_model=QueryResponse)
def query(req: QueryRequest) -> QueryResponse:
    """Free-form question answering over feedback data."""
    try:
        out = svc.query(req.query, top_k=req.top_k)
        # Flatten each retrieved row into the public hit schema.
        hits: List[Dict[str, Any]] = []
        for hit in out.results:
            hits.append(
                {
                    "score": hit.score,
                    "service": hit.row.get(settings.service_column, ""),
                    "level": hit.row.get(settings.level_column, ""),
                    "text": hit.row.get(settings.text_column, ""),
                }
            )
        return QueryResponse(query=out.query, summary=out.summary, results=hits)
    except FileNotFoundError:
        # Index has not been built yet — return an actionable hint.
        return QueryResponse(
            query=req.query,
            summary="Error: Vector index not found. Please run /ingest first.",
            results=[],
        )
    except Exception as e:
        return QueryResponse(query=req.query, summary=f"Error: {str(e)}", results=[])
+
|
| 83 |
+
|
| 84 |
+
def _suggest_topic_name(topic_id: int, topic_texts: List[str]) -> str:
    """Ask Gemini (preferred) then OpenAI for a short Hebrew topic name.

    Falls back to a generic numbered name ("נושא N") when no provider is
    configured or every provider call fails.
    """
    sample_str = "\n".join(f"- {t[:200]}" for t in topic_texts[:5])
    prompt = (
        "Based on the following citizen feedback examples, provide a short topic name (2-4 words) "
        "in Hebrew that describes what users are talking about. "
        "Return ONLY the topic name, nothing else.\n\n"
        f"Examples:\n{sample_str}\n\nTopic name:"
    )

    topic_name = f"נושא {topic_id + 1}"  # Default fallback

    # Try Gemini first
    if settings.gemini_api_key:
        try:
            import google.generativeai as genai
            genai.configure(api_key=settings.gemini_api_key)
            model = genai.GenerativeModel("gemini-1.5-flash")
            resp = model.generate_content(prompt)
            text = getattr(resp, "text", None)
            if isinstance(text, str) and text.strip():
                topic_name = text.strip()
        except Exception:
            pass

    # Fallback to OpenAI only if Gemini did not produce a name.
    if topic_name.startswith("נושא") and settings.openai_api_key:
        try:
            from openai import OpenAI
            client = OpenAI(api_key=settings.openai_api_key)
            resp = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,
                max_tokens=20,
            )
            if resp.choices[0].message.content:
                topic_name = resp.choices[0].message.content.strip()
        except Exception:
            pass

    return topic_name


@app.get("/topics")
def topics(num_topics: int = Query(5, ge=2, le=50)) -> Dict[str, Any]:
    """Extract main topics from feedback. Returns topics with summaries."""
    try:
        # FAISS does not expose stored vectors, so embeddings are recomputed
        # for clustering. Previously the full index + metadata were loaded
        # here and immediately discarded; a cheap existence check keeps the
        # same "run /ingest first" error without that per-request cost.
        import os
        if not os.path.exists(settings.vector_index_path):
            raise FileNotFoundError(settings.vector_index_path)

        df = load_feedback()
        texts = df[settings.text_column].astype(str).tolist()
        if not texts:
            return {"num_topics": 0, "topics": {}, "error": "No feedback data found"}

        embeddings = embedder.encode(texts)
        res = kmeans_topics(embeddings, num_topics=num_topics)

        # Group texts by cluster label.
        topics_out: Dict[int, List[str]] = {}
        for label, text in zip(res.labels, texts):
            topics_out.setdefault(int(label), []).append(text)

        # Name each topic (LLM-assisted) and attach counts plus examples.
        formatted_topics: Dict[str, Any] = {}
        for topic_id, topic_texts in topics_out.items():
            formatted_topics[str(topic_id)] = {
                "name": _suggest_topic_name(topic_id, topic_texts),
                "count": len(topic_texts),
                "examples": topic_texts[:5],  # First 5 examples
            }

        return {
            "num_topics": num_topics,
            "topics": formatted_topics,
            "total_feedback": len(texts),
        }
    except FileNotFoundError:
        return {"error": "Vector index not found. Please run /ingest first.", "num_topics": 0, "topics": {}}
    except Exception as e:
        return {"error": str(e), "num_topics": 0, "topics": {}}
+
|
| 170 |
+
|
| 171 |
+
@app.get("/sentiment")
def sentiment(limit: int = Query(100, ge=1, le=2000)) -> Dict[str, Any]:
    """Classify the first `limit` feedback rows and return per-text labels."""
    rows = load_feedback().head(limit)
    labelled = analyze_sentiments(rows[settings.text_column].astype(str).tolist())
    return {"count": len(labelled), "results": labelled}
+
|
app/config.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
from dataclasses import dataclass
from dotenv import load_dotenv  # type: ignore


# Pull a local .env into the environment if one exists; real environment
# variables win (override=False). The .env file itself is gitignored.
load_dotenv(override=False)


@dataclass
class Settings:
    """Runtime configuration, resolved from environment variables at import time."""

    # LLM provider credentials — both optional; Gemini is preferred downstream.
    openai_api_key: str | None = os.getenv("OPENAI_API_KEY")
    gemini_api_key: str | None = os.getenv("GEMINI_API_KEY")
    # Sentence-Transformers model used for all embeddings.
    embedding_model_name: str = os.getenv(
        "EMBEDDING_MODEL",
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    )
    # Persistence locations for the FAISS index and its row metadata.
    vector_index_path: str = os.getenv("VECTOR_INDEX_PATH", ".vector_index/faiss.index")
    vector_metadata_path: str = os.getenv("VECTOR_METADATA_PATH", ".vector_index/meta.parquet")
    # Input CSV location and the column names the app reads from it.
    csv_path: str = os.getenv("CSV_PATH", "Feedback.csv")
    text_column: str = os.getenv("TEXT_COLUMN", "Text")
    service_column: str = os.getenv("SERVICE_COLUMN", "ServiceName")
    level_column: str = os.getenv("LEVEL_COLUMN", "Level")


# Shared singleton imported everywhere else in the app.
settings = Settings()
| 27 |
+
|
app/data_loader.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from .config import settings
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def load_feedback(csv_path: str | None = None) -> pd.DataFrame:
|
| 8 |
+
path = csv_path or settings.csv_path
|
| 9 |
+
df = pd.read_csv(path)
|
| 10 |
+
# Basic normalization of expected columns if present
|
| 11 |
+
expected = ["ID", "ServiceName", "Level", "Text"]
|
| 12 |
+
missing = [c for c in expected if c not in df.columns]
|
| 13 |
+
if missing:
|
| 14 |
+
raise ValueError(f"Missing expected columns in CSV: {missing}")
|
| 15 |
+
# Drop rows with empty text
|
| 16 |
+
df = df[df["Text"].astype(str).str.strip().ne("")].copy()
|
| 17 |
+
df.reset_index(drop=True, inplace=True)
|
| 18 |
+
return df
|
| 19 |
+
|
app/embedding.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Iterable, List
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
from sentence_transformers import SentenceTransformer # type: ignore
|
| 7 |
+
|
| 8 |
+
from .config import settings
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class EmbeddingModel:
|
| 12 |
+
def __init__(self, model_name: str | None = None) -> None:
|
| 13 |
+
self.model_name = model_name or settings.embedding_model_name
|
| 14 |
+
self.model = SentenceTransformer(self.model_name)
|
| 15 |
+
|
| 16 |
+
def encode(self, texts: Iterable[str], batch_size: int = 32) -> np.ndarray:
|
| 17 |
+
embeddings = self.model.encode(
|
| 18 |
+
list(texts),
|
| 19 |
+
batch_size=batch_size,
|
| 20 |
+
show_progress_bar=True,
|
| 21 |
+
convert_to_numpy=True,
|
| 22 |
+
normalize_embeddings=True,
|
| 23 |
+
)
|
| 24 |
+
return embeddings
|
| 25 |
+
|
| 26 |
+
def encode_single(self, text: str) -> np.ndarray:
|
| 27 |
+
return self.encode([text])[0]
|
| 28 |
+
|
app/preprocess.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from langdetect import detect, DetectorFactory # type: ignore
|
| 4 |
+
|
| 5 |
+
DetectorFactory.seed = 42
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def detect_language(text: str) -> str:
    """Best-effort language code for *text*; "unknown" when detection fails.

    langdetect raises on inputs it cannot profile (empty strings, pure
    symbols), hence the broad guard around the call.
    """
    try:
        return detect(text)
    except Exception:
        return "unknown"
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def normalize_text(text: str) -> str:
    """Collapse runs of whitespace into single spaces and trim the ends.

    Deliberately minimal: non-Latin scripts (e.g. Hebrew) pass through
    untouched.
    """
    tokens = str(text).split()
    return " ".join(tokens)
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def preprocess_text(text: str) -> str:
    """Canonical single-step preprocessing hook: whitespace normalization only."""
    return normalize_text(text)
| 22 |
+
|
app/rag_service.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import List, Optional, Dict
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
from .config import settings
|
| 11 |
+
from .data_loader import load_feedback
|
| 12 |
+
from .embedding import EmbeddingModel
|
| 13 |
+
from .preprocess import preprocess_text
|
| 14 |
+
from .vector_store import FaissVectorStore, SearchResult
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
try:
|
| 18 |
+
from openai import OpenAI # type: ignore
|
| 19 |
+
except Exception: # pragma: no cover - optional
|
| 20 |
+
OpenAI = None # type: ignore
|
| 21 |
+
|
| 22 |
+
try:
|
| 23 |
+
import google.generativeai as genai # type: ignore
|
| 24 |
+
except Exception: # pragma: no cover - optional
|
| 25 |
+
genai = None # type: ignore
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass
class RetrievalOutput:
    """Bundle for one RAG query: the query text, ranked hits, optional summary."""

    query: str
    results: List[SearchResult]
    summary: Optional[str]
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class RAGService:
    """End-to-end retrieval-augmented service over the feedback CSV.

    Owns one embedding model and a lazily-loaded FAISS store. Summaries go
    through Gemini when configured, then OpenAI, then a plain extractive
    fallback.
    """

    def __init__(self) -> None:
        self.embedder = EmbeddingModel()
        self.store: Optional[FaissVectorStore] = None

    def ingest(self, df: Optional[pd.DataFrame] = None) -> None:
        """Embed all feedback texts, build a FAISS index, and persist it."""
        data = load_feedback() if df is None else df
        raw_texts = data[settings.text_column].astype(str).tolist()
        texts = [preprocess_text(t) for t in raw_texts]
        vectors = self.embedder.encode(texts)

        store = FaissVectorStore(dim=vectors.shape[1])
        # Keep only the columns the API surfaces as row metadata.
        meta_cols = [settings.text_column, settings.service_column, settings.level_column]
        store.add(vectors.astype(np.float32), data[meta_cols])
        store.save(settings.vector_index_path, settings.vector_metadata_path)
        self.store = store

    def _ensure_store(self) -> None:
        """Lazily load the persisted index; raise when it was never built."""
        if self.store is not None:
            return
        import os
        if not os.path.exists(settings.vector_index_path):
            raise FileNotFoundError(
                f"Vector index not found at {settings.vector_index_path}. "
                "Please run /ingest endpoint first or precompute the index."
            )
        self.store = FaissVectorStore.load(settings.vector_index_path, settings.vector_metadata_path)

    def retrieve(self, query: str, top_k: int = 5) -> List[SearchResult]:
        """Return the ``top_k`` nearest feedback rows for *query*."""
        self._ensure_store()
        assert self.store is not None
        q_vec = self.embedder.encode_single(preprocess_text(query))
        return self.store.search(q_vec, top_k=top_k)

    def summarize(self, query: str, contexts: List[str]) -> Optional[str]:
        """Summarize retrieved snippets via Gemini, then OpenAI, then extractively.

        Returns None when there is nothing to summarize.
        """
        if not contexts:
            return None
        joined = "\n".join(f"- {c}" for c in contexts[:10])
        # Answer in Hebrew when the query contains Hebrew-block characters.
        is_hebrew = any('\u0590' <= char <= '\u05FF' for char in query)
        lang_instruction = "ענה בעברית" if is_hebrew else "Answer in the language of the query"

        prompt = (
            f"You are a government digital services assistant. Based on the following citizen feedback snippets, "
            f"write a concise summary (max 100 words) highlighting key issues and suggestions. "
            f"{lang_instruction}.\n\n"
            f"Query:\n{query}\n\nFeedback:\n{joined}\n\nSummary:"
        )
        # Prefer Gemini if configured
        if settings.gemini_api_key and genai is not None:
            try:
                genai.configure(api_key=settings.gemini_api_key)
                model = genai.GenerativeModel("gemini-1.5-flash")
                resp = model.generate_content(prompt)
                text = getattr(resp, "text", None)
                if isinstance(text, str) and text.strip():
                    return text.strip()
            except Exception:
                pass
        # Fallback to OpenAI if available
        if settings.openai_api_key and OpenAI is not None:
            client = OpenAI(api_key=settings.openai_api_key)
            try:
                resp = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.2,
                    max_tokens=200,
                )
                return resp.choices[0].message.content
            except Exception:
                pass
        # Last resort: crude extractive "summary" from the top contexts.
        return " ".join(contexts[:3])

    def query(self, query: str, top_k: int = 5) -> RetrievalOutput:
        """Retrieve then summarize; package both into a RetrievalOutput."""
        results = self.retrieve(query, top_k=top_k)
        contexts = [r.row[settings.text_column] for r in results]
        return RetrievalOutput(query=query, results=results, summary=self.summarize(query, contexts))
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def main() -> None:
    """CLI entry point: optionally ingest the CSV, then optionally run a query.

    Flags:
        --ingest  Build (or rebuild) the vector index from the CSV.
        --query   Run a semantic query against the built index.
        --top_k   Number of results to print (default 5).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ingest", action="store_true", help="Ingest CSV and build index")
    parser.add_argument("--query", type=str, default=None, help="Run a semantic query")
    parser.add_argument("--top_k", type=int, default=5, help="Top K results")
    args = parser.parse_args()

    svc = RAGService()
    if args.ingest:
        svc.ingest()
        print("Ingest completed.")
    if args.query:
        out = svc.query(args.query, top_k=args.top_k)
        print("Summary:", out.summary)
        for r in out.results:
            # Fix: use the configured text column (consistent with
            # RAGService.query) instead of the hard-coded 'Text' name,
            # which silently printed nothing when the column differed.
            snippet = str(r.row.get(settings.text_column, ""))[:200]
            print(f"[{r.score:.3f}] {r.row.get('ServiceName','')} | {snippet}")


if __name__ == "__main__":
    main()
|
| 135 |
+
|
app/sentiment.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
|
| 6 |
+
from transformers import pipeline # type: ignore
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@lru_cache(maxsize=1)
def get_sentiment_pipeline():
    """Lazily build and memoize the multilingual sentiment classifier."""
    # XLM-RoBERTa fine-tuned for sentiment; covers Hebrew and English alike.
    model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
    return pipeline("sentiment-analysis", model=model_name)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def analyze_sentiments(texts: List[str]) -> List[Dict[str, float | str]]:
    """Classify each text and return one {'label', 'score'} dict per input."""
    classifier = get_sentiment_pipeline()
    # Truncation keeps over-length inputs within the model's context window.
    predictions = classifier(texts, truncation=True)
    return [
        {"label": pred.get("label", ""), "score": float(pred.get("score", 0.0))}
        for pred in predictions
    ]
|
| 24 |
+
|
app/topics.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
from sklearn.cluster import KMeans # type: ignore
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class TopicResult:
    """Cluster assignment for a batch of embeddings."""
    labels: List[int]      # cluster id for each input row
    centroids: np.ndarray  # (k, dim) cluster centers


def kmeans_topics(embeddings: np.ndarray, num_topics: int = 8, seed: int = 42) -> TopicResult:
    """Cluster embeddings into topics with KMeans.

    Robustness fixes over the naive version:
    - an empty 1-D array (e.g. ``np.array([])``) no longer raises
      IndexError on ``shape[1]``;
    - when there are fewer rows than requested topics, the cluster count
      is clamped so sklearn's KMeans does not raise.
    """
    embeddings = np.asarray(embeddings)
    n_samples = len(embeddings)
    if n_samples == 0:
        # A 1-D empty array has no second dimension; fall back to dim 0.
        dim = embeddings.shape[1] if embeddings.ndim > 1 else 0
        return TopicResult(labels=[], centroids=np.empty((0, dim)))
    # KMeans requires n_clusters <= n_samples.
    k = min(num_topics, n_samples)
    km = KMeans(n_clusters=k, random_state=seed, n_init="auto")
    labels = km.fit_predict(embeddings)
    return TopicResult(labels=[int(lbl) for lbl in labels], centroids=km.cluster_centers_)
|
| 22 |
+
|
app/vector_store.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import List, Tuple, Optional
|
| 6 |
+
|
| 7 |
+
import faiss # type: ignore
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pandas as pd
|
| 10 |
+
|
| 11 |
+
from .config import settings
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class SearchResult:
    """One nearest-neighbour hit returned by FaissVectorStore.search."""
    index: int      # position of the matched vector in the store's metadata frame
    score: float    # inner-product similarity score (higher is more similar)
    row: pd.Series  # the metadata row describing the matched feedback entry
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class FaissVectorStore:
    """In-memory FAISS inner-product index paired with a metadata DataFrame.

    Row i of ``metadata`` describes vector i in the FAISS index; the two
    structures must be kept in lockstep via ``add``. Inner product behaves
    as cosine similarity only if callers normalize their vectors —
    presumably the embedding layer does; verify against the encoder.
    """

    def __init__(self, dim: int) -> None:
        self.dim = dim
        # Exact (brute-force) inner-product search; fine for modest corpora.
        self.index = faiss.IndexFlatIP(dim)
        self.metadata: Optional[pd.DataFrame] = None

    def add(self, vectors: np.ndarray, metadata: pd.DataFrame) -> None:
        """Append vectors and their aligned metadata rows to the store."""
        if vectors.dtype != np.float32:
            vectors = vectors.astype(np.float32)  # FAISS requires float32
        if self.metadata is None:
            self.metadata = metadata.reset_index(drop=True)
        else:
            self.metadata = pd.concat([self.metadata, metadata], ignore_index=True)
        self.index.add(vectors)

    def search(self, query_vector: np.ndarray, top_k: int = 5) -> List[SearchResult]:
        """Return up to top_k nearest rows by inner-product score."""
        q = query_vector.astype(np.float32).reshape(1, -1)
        scores, idxs = self.index.search(q, top_k)
        results: List[SearchResult] = []
        for score, idx in zip(scores[0], idxs[0]):
            # FAISS pads with index -1 when fewer than top_k vectors exist.
            if idx < 0 or self.metadata is None:
                continue
            results.append(SearchResult(index=int(idx), score=float(score), row=self.metadata.iloc[int(idx)]))
        return results

    def save(self, vector_path: str, meta_path: str) -> None:
        """Persist the index (and metadata, if any) to disk."""
        parent = os.path.dirname(vector_path)
        # Fix: os.path.dirname returns "" for a bare filename, and
        # os.makedirs("") raises FileNotFoundError — only create the
        # directory when the path actually contains one.
        if parent:
            os.makedirs(parent, exist_ok=True)
        faiss.write_index(self.index, vector_path)
        if self.metadata is not None:
            self.metadata.to_parquet(meta_path, index=False)

    @classmethod
    def load(cls, vector_path: str, meta_path: str) -> "FaissVectorStore":
        """Rehydrate a store previously written by ``save``."""
        index = faiss.read_index(vector_path)
        dim = index.d
        store = cls(dim=dim)
        store.index = index
        if os.path.exists(meta_path):
            store.metadata = pd.read_parquet(meta_path)
        return store
|
| 61 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.115.5
|
| 2 |
+
uvicorn[standard]==0.32.0
|
| 3 |
+
pandas==2.2.3
|
| 4 |
+
numpy==1.26.4
|
| 5 |
+
scikit-learn==1.5.2
|
| 6 |
+
faiss-cpu==1.8.0.post1
|
| 7 |
+
sentence-transformers==3.1.1
|
| 8 |
+
transformers==4.45.2
|
| 9 |
+
torch==2.4.1
|
| 10 |
+
langdetect==1.0.9
|
| 11 |
+
openai==1.52.2
|
| 12 |
+
python-dotenv==1.0.1
|
| 13 |
+
pydantic==2.9.2
|
| 14 |
+
orjson==3.10.7
|
| 15 |
+
google-generativeai==0.6.0
|
| 16 |
+
pyarrow==14.0.2
|
| 17 |
+
|
run.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import uvicorn # type: ignore
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def main() -> None:
|
| 7 |
+
uvicorn.run("app.api:app", host="0.0.0.0", port=8000, reload=False)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
if __name__ == "__main__":
|
| 11 |
+
main()
|
| 12 |
+
|
scripts/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Optional: mark `scripts` as a package to allow module-style execution.
|
| 2 |
+
|
scripts/precompute_index.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from app.rag_service import RAGService
|
| 7 |
+
from app.config import settings
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def main() -> None:
    """Precompute the vector index so the API can start without ingesting."""
    # Ensure the directory that will hold the index artifacts exists first.
    target_dir = Path(settings.vector_index_path).parent
    target_dir.mkdir(parents=True, exist_ok=True)

    service = RAGService()
    service.ingest()

    print(f"Index written to: {settings.vector_index_path}")
    print(f"Metadata written to: {settings.vector_metadata_path}")


if __name__ == "__main__":
    main()
|
| 21 |
+
|