Commit ·
1da3dc8
1
Parent(s): 1c23b7c
docs: update README, add module docstrings, CONTRIBUTING and VERSION
Browse files- CONTRIBUTING.md +30 -0
- README.md +10 -4
- VERSION +1 -0
- app/__init__.py +11 -0
- app/analysis.py +97 -0
- app/api.py +24 -6
- app/data_loader.py +7 -0
- app/embedding.py +7 -0
- app/preprocess.py +20 -9
- app/rag_service.py +38 -0
- app/sentiment.py +8 -0
- app/vector_store.py +8 -0
- run.py +7 -0
- scripts/precompute_index.py +8 -0
- scripts/test_queries.py +48 -0
CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Contributing and Usage Guide
|
| 2 |
+
|
| 3 |
+
This project implements a Retrieval-Augmented Generation (RAG) service over citizen feedback.
|
| 4 |
+
|
| 5 |
+
Goals:
|
| 6 |
+
- Make the API easy to run locally and deploy to Runpod or any container platform.
|
| 7 |
+
- Keep sensitive keys out of the repo; use environment variables.
|
| 8 |
+
|
| 9 |
+
Quick workflow:
|
| 10 |
+
1. Create branch: `git checkout -b feat/improve-intents`
|
| 11 |
+
2. Make changes and run tests locally.
|
| 12 |
+
3. Commit and push: `git add . && git commit -m "feat: ..." && git push --set-upstream origin feat/improve-intents`
|
| 13 |
+
4. Open a Pull Request and request review.
|
| 14 |
+
|
| 15 |
+
Building the image:
|
| 16 |
+
1. Update `Dockerfile` if you need to pre-bake models.
|
| 17 |
+
2. Build and tag:
|
| 18 |
+
```bash
|
| 19 |
+
docker build -t youruser/feedback-rag:v1 .
|
| 20 |
+
docker push youruser/feedback-rag:v1
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
Run on Runpod:
|
| 24 |
+
- See the `README.md` section "Run on Runpod - Full guide" for a step-by-step guide.
|
| 25 |
+
|
| 26 |
+
Tests:
|
| 27 |
+
- No unit tests included yet. Prefer adding `pytest` tests for `app/analysis.py` and the API layer.
|
| 28 |
+
|
| 29 |
+
Contact:
|
| 30 |
+
- For major changes, create an issue first describing the design and performance considerations.
|
README.md
CHANGED
|
@@ -59,6 +59,8 @@ Environment variables:
|
|
| 59 |
- OPENAI_API_KEY: If set, RAG can use OpenAI as a fallback
|
| 60 |
- EMBEDDING_MODEL: Sentence-Transformers model name (default: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)
|
| 61 |
- VECTOR_INDEX_PATH: Path to persist FAISS index (default: ./.vector_index/faiss.index)
|
|
|
|
|
|
|
| 62 |
|
| 63 |
### Notes
|
| 64 |
- The first run will download models (embeddings, sentiment); ensure internet access.
|
|
@@ -136,13 +138,17 @@ curl -X POST {YOUR_ENDPOINT_URL}/query \
|
|
| 136 |
-d '{"query":"שיפור טופס", "top_k": 5}' \
|
| 137 |
{YOUR_ENDPOINT_URL}/query
|
| 138 |
```
|
| 139 |
-
- Topics:
|
| 140 |
```
|
| 141 |
-
curl -
|
|
|
|
|
|
|
| 142 |
```
|
| 143 |
-
- Sentiment (first N rows):
|
| 144 |
```
|
| 145 |
-
curl -
|
|
|
|
|
|
|
| 146 |
```
|
| 147 |
- Interactive docs (Swagger UI):
|
| 148 |
- Open `{YOUR_ENDPOINT_URL}/docs` in your browser
|
|
|
|
| 59 |
- OPENAI_API_KEY: If set, RAG can use OpenAI as a fallback
|
| 60 |
- EMBEDDING_MODEL: Sentence-Transformers model name (default: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)
|
| 61 |
- VECTOR_INDEX_PATH: Path to persist FAISS index (default: ./.vector_index/faiss.index)
|
| 62 |
+
- VECTOR_METADATA_PATH: Path to persist FAISS index metadata (default: ./.vector_index/meta.parquet)
|
| 63 |
+
- CSV_PATH: Optional path to your CSV (if not `Feedback.csv` in repo root)
|
| 64 |
|
| 65 |
### Notes
|
| 66 |
- The first run will download models (embeddings, sentiment); ensure internet access.
|
|
|
|
| 138 |
-d '{"query":"שיפור טופס", "top_k": 5}' \
|
| 139 |
{YOUR_ENDPOINT_URL}/query
|
| 140 |
```
|
| 141 |
+
- Topics (POST JSON):
|
| 142 |
```
|
| 143 |
+
curl -X POST {YOUR_ENDPOINT_URL}/topics \
|
| 144 |
+
-H "Content-Type: application/json" \
|
| 145 |
+
-d '{"num_topics":8}'
|
| 146 |
```
|
| 147 |
+
- Sentiment (first N rows, POST JSON):
|
| 148 |
```
|
| 149 |
+
curl -X POST {YOUR_ENDPOINT_URL}/sentiment \
|
| 150 |
+
-H "Content-Type: application/json" \
|
| 151 |
+
-d '{"limit":100}'
|
| 152 |
```
|
| 153 |
- Interactive docs (Swagger UI):
|
| 154 |
- Open `{YOUR_ENDPOINT_URL}/docs` in your browser
|
VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
0.1.0
|
app/__init__.py
CHANGED
|
@@ -1,2 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Makes `app` a package so imports like `from app.rag_service import RAGService` work.
|
| 2 |
|
|
|
|
| 1 |
+
"""Application package for the Feedback Analysis RAG Agent.
|
| 2 |
+
|
| 3 |
+
This package contains the core modules that implement ingestion, embedding,
|
| 4 |
+
vector storage and the FastAPI endpoints used by the service.
|
| 5 |
+
|
| 6 |
+
Import example:
|
| 7 |
+
from app.rag_service import RAGService
|
| 8 |
+
|
| 9 |
+
Keep this file minimal — module-level documentation only.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
# Makes `app` a package so imports like `from app.rag_service import RAGService` work.
|
| 13 |
|
app/analysis.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Utilities to detect simple question intents and resolve counts over the feedback corpus.

This module implements lightweight, rule-based detection for queries such as:
- "כמה משתמשים כתבו תודה" -> count thank-you messages
- "כמה מתלוננים על אלמנטים שלא עובדים" -> count complaint-like messages

The approach is intentionally simple (keyword matching) to avoid heavy dependencies and
to provide fast, explainable counts. It returns structured dicts that higher-level code
can convert to human-readable summaries or JSON responses.
"""

# NOTE: the future import must follow the module docstring. Previously the
# docstring came after it, so it was a plain expression statement and was not
# picked up as the module's __doc__.
from __future__ import annotations

import re
from typing import Iterable, List, Optional, Tuple

import pandas as pd

from .config import settings
from .preprocess import preprocess_text
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Hebrew phrases that signal a complaint about something broken or failing
# (e.g. "doesn't work", "error", "malfunction"). Matched as plain substrings
# against preprocessed, lower-cased feedback text.
COMPLAINT_KEYWORDS = [
    "לא עובד",
    "לא עובדים",
    "שגיאה",
    "תקלה",
    "לא פועל",
    "נכשל",
    "לא מצליח",
    "לא ניתן",
    "המערכת לא",
    "לא תקין",
    "לא עובדים להם",
]

# Hebrew thank-you phrases; note the first entry ("תודה") already matches all
# longer variants as a substring — the extras make the intent explicit.
THANKS_KEYWORDS = ["תודה", "תודה רבה", "תודה!", "תודה רבה!", "תודה רבה מאוד"]
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _contains_any(text: str, keywords: Iterable[str]) -> bool:
    """Return True if the normalized, lower-cased *text* contains any of *keywords*.

    Keywords are lower-cased before matching for consistency with
    `count_keyword_rows` (a no-op for Hebrew, which has no letter case, but
    it keeps mixed-language keyword lists behaving the same in both helpers).
    """
    t = preprocess_text(text).lower()
    return any(str(kw).lower() in t for kw in keywords)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def count_keyword_rows(df: pd.DataFrame, keywords: Iterable[str], text_column: str = "Text") -> int:
    """Count rows of *df* whose text contains at least one of *keywords*.

    Matching is substring-based on preprocessed, lower-cased text; returns 0
    for a missing or empty DataFrame.
    """
    if df is None or df.empty:
        return 0

    lowered = [str(k).lower() for k in keywords]

    def row_match(raw: str) -> bool:
        normalized = preprocess_text(str(raw)).lower()
        return any(kw in normalized for kw in lowered)

    matches = df[text_column].astype(str).apply(row_match)
    return int(matches.sum())
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def detect_query_type(query: str) -> Tuple[str, Optional[str]]:
    """Return (type, target) where type is one of: 'count_thanks', 'count_complaint', 'count_keyword', 'freeform'.

    target may contain a detected keyword or phrase when relevant.
    """
    q = preprocess_text(query).lower()

    # Simple Hebrew heuristics: thanks first, then complaint markers.
    if "תודה" in q or "מודה" in q:
        return ("count_thanks", None)

    complaint_markers = ["לא עובד", "לא עובדים", "תקלה", "שגיאה", "לא פועל", "נכשל"]
    if any(marker in q for marker in complaint_markers):
        return ("count_complaint", None)

    # Generic "how many" (כמה) question: try to pull the topic phrase that
    # follows the preposition "על" ("about"), or whatever follows "כמה <word>".
    if q.strip().startswith("כמה") or "כמה משתמשים" in q:
        after_preposition = re.search(r"על\s+([^\n\?]+)", q)
        if after_preposition:
            return ("count_keyword", after_preposition.group(1).strip())
        after_how_many = re.search(r"כמה\s+[^\s]+\s+([^\n\?]+)", q)
        if after_how_many:
            return ("count_keyword", after_how_many.group(1).strip())
        return ("count_keyword", None)

    return ("freeform", None)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def resolve_count_from_type(df: pd.DataFrame, qtype: str, target: Optional[str], text_column: str = "Text"):
    """Resolve a detected count-style query into a structured count result.

    Args:
        df: Feedback corpus (may be ``None`` or empty; counts become 0).
        qtype: One of ``'count_thanks'``, ``'count_complaint'``, ``'count_keyword'``.
        target: Optional phrase to count for ``'count_keyword'`` queries.
        text_column: Column of *df* holding the feedback text.

    Returns:
        ``{"type": "count", "label": ..., "count": int}`` for count queries,
        otherwise ``{"type": "unknown"}``.
    """
    if qtype == "count_thanks":
        cnt = count_keyword_rows(df, THANKS_KEYWORDS, text_column=text_column)
        return {"type": "count", "label": "thanks", "count": cnt}
    if qtype == "count_complaint":
        cnt = count_keyword_rows(df, COMPLAINT_KEYWORDS, text_column=text_column)
        return {"type": "count", "label": "complaint_not_working", "count": cnt}
    if qtype == "count_keyword":
        # Guard: a missing/empty corpus yields zero matches instead of raising
        # (count_keyword_rows already guards its branches; this one must too).
        if df is None or df.empty:
            label = f"keyword:{target}" if target else "all"
            return {"type": "count", "label": label, "count": 0}
        if target:
            # Count rows containing the exact target phrase. This is a literal
            # substring match, so use regex=False rather than escaping into a regex.
            phrase = target.lower()
            cnt = int(df[text_column].astype(str).str.lower().str.contains(phrase, regex=False).sum())
            return {"type": "count", "label": f"keyword:{target}", "count": cnt}
        # Fallback: no target phrase extracted — report the corpus size.
        return {"type": "count", "label": "all", "count": int(len(df))}
    return {"type": "unknown"}
|
app/api.py
CHANGED
|
@@ -53,7 +53,8 @@ def ingest() -> Dict[str, Any]:
|
|
| 53 |
def query(req: QueryRequest) -> QueryResponse:
|
| 54 |
"""Free-form question answering over feedback data."""
|
| 55 |
try:
|
| 56 |
-
|
|
|
|
| 57 |
return QueryResponse(
|
| 58 |
query=out.query,
|
| 59 |
summary=out.summary,
|
|
@@ -81,9 +82,17 @@ def query(req: QueryRequest) -> QueryResponse:
|
|
| 81 |
)
|
| 82 |
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
try:
|
| 88 |
# Load embeddings from store
|
| 89 |
store = FaissVectorStore.load(settings.vector_index_path, settings.vector_metadata_path)
|
|
@@ -168,8 +177,17 @@ def topics(num_topics: int = Query(5, ge=2, le=50)) -> Dict[str, Any]:
|
|
| 168 |
return {"error": str(e), "num_topics": 0, "topics": {}}
|
| 169 |
|
| 170 |
|
| 171 |
-
|
| 172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
df = load_feedback().head(limit)
|
| 174 |
texts = df[settings.text_column].astype(str).tolist()
|
| 175 |
out = analyze_sentiments(texts)
|
|
|
|
| 53 |
def query(req: QueryRequest) -> QueryResponse:
|
| 54 |
"""Free-form question answering over feedback data."""
|
| 55 |
try:
|
| 56 |
+
# Use the higher-level answer pipeline which can handle counts and keyword queries
|
| 57 |
+
out = svc.answer(req.query, top_k=req.top_k)
|
| 58 |
return QueryResponse(
|
| 59 |
query=out.query,
|
| 60 |
summary=out.summary,
|
|
|
|
| 82 |
)
|
| 83 |
|
| 84 |
|
| 85 |
+
# Request body schema for POST /topics.
# NOTE(review): the GET endpoint this replaces enforced bounds via
# Query(5, ge=2, le=50); this model accepts any int — consider
# Field(5, ge=2, le=50) to restore that validation. TODO confirm intent.
class TopicsRequest(BaseModel):
    num_topics: int = 5
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
@app.post("/topics")
|
| 90 |
+
def topics(req: TopicsRequest) -> Dict[str, Any]:
|
| 91 |
+
"""Extract main topics from feedback. Accepts POST body: {"num_topics": int}.
|
| 92 |
+
|
| 93 |
+
Using POST allows larger and structured request bodies (and avoids URL length limits).
|
| 94 |
+
"""
|
| 95 |
+
num_topics = req.num_topics
|
| 96 |
try:
|
| 97 |
# Load embeddings from store
|
| 98 |
store = FaissVectorStore.load(settings.vector_index_path, settings.vector_metadata_path)
|
|
|
|
| 177 |
return {"error": str(e), "num_topics": 0, "topics": {}}
|
| 178 |
|
| 179 |
|
| 180 |
+
# Request body schema for POST /sentiment.
# NOTE(review): `limit` is unbounded here; a very large value would load and
# score that many rows — consider adding ge/le constraints. TODO confirm.
class SentimentRequest(BaseModel):
    limit: int = 100
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
@app.post("/sentiment")
|
| 185 |
+
def sentiment(req: SentimentRequest) -> Dict[str, Any]:
|
| 186 |
+
"""Analyze sentiment for the first `limit` feedback entries. Accepts POST body: {"limit": 100}.
|
| 187 |
+
|
| 188 |
+
Using POST keeps the API consistent for clients that prefer JSON bodies over URL query params.
|
| 189 |
+
"""
|
| 190 |
+
limit = req.limit
|
| 191 |
df = load_feedback().head(limit)
|
| 192 |
texts = df[settings.text_column].astype(str).tolist()
|
| 193 |
out = analyze_sentiments(texts)
|
app/data_loader.py
CHANGED
|
@@ -1,5 +1,12 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
from .config import settings
|
| 5 |
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
"""Load feedback data from CSV and normalize expected columns.
|
| 4 |
+
|
| 5 |
+
The system expects a CSV with at least the columns: ID, ServiceName, Level, Text.
|
| 6 |
+
`load_feedback` validates the presence of these columns, drops empty text rows,
|
| 7 |
+
and returns a cleaned Pandas DataFrame.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
import pandas as pd
|
| 11 |
from .config import settings
|
| 12 |
|
app/embedding.py
CHANGED
|
@@ -1,5 +1,12 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from typing import Iterable, List
|
| 4 |
|
| 5 |
import numpy as np
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
"""EmbeddingModel wrapper around sentence-transformers.
|
| 4 |
+
|
| 5 |
+
This class lazily loads a SentenceTransformer model (configured via
|
| 6 |
+
`settings.embedding_model_name`) and exposes `encode` and `encode_single`.
|
| 7 |
+
Normalizes embeddings to unit length for cosine-similarity search in FAISS.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
from typing import Iterable, List
|
| 11 |
|
| 12 |
import numpy as np
|
app/preprocess.py
CHANGED
|
@@ -1,14 +1,25 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
return "unknown"
|
| 13 |
|
| 14 |
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
"""Text preprocessing helpers.
|
| 4 |
+
|
| 5 |
+
Includes minimal normalization and an optional language detection helper. The
|
| 6 |
+
`langdetect` dependency is optional — when it's not installed, `detect_language`
|
| 7 |
+
returns "unknown". This keeps lightweight workflows (like simple counting) runnable
|
| 8 |
+
without installing all NLP dependencies.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
try:
    from langdetect import DetectorFactory, detect  # type: ignore

    # Fixed seed so langdetect gives deterministic results across runs.
    DetectorFactory.seed = 42

    def detect_language(text: str) -> str:
        """Best-effort language code for *text*; "unknown" when detection fails."""
        try:
            return detect(text)
        except Exception:
            return "unknown"

except Exception:
    # langdetect is optional for lightweight usage; provide a stub fallback.
    def detect_language(text: str) -> str:
        """Fallback used when langdetect is unavailable; always returns "unknown"."""
        return "unknown"
|
| 24 |
|
| 25 |
|
app/rag_service.py
CHANGED
|
@@ -12,6 +12,7 @@ from .data_loader import load_feedback
|
|
| 12 |
from .embedding import EmbeddingModel
|
| 13 |
from .preprocess import preprocess_text
|
| 14 |
from .vector_store import FaissVectorStore, SearchResult
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
try:
|
|
@@ -111,6 +112,43 @@ class RAGService:
|
|
| 111 |
summary = self.summarize(query, contexts)
|
| 112 |
return RetrievalOutput(query=query, results=results, summary=summary)
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
def main() -> None:
|
| 116 |
parser = argparse.ArgumentParser()
|
|
|
|
| 12 |
from .embedding import EmbeddingModel
|
| 13 |
from .preprocess import preprocess_text
|
| 14 |
from .vector_store import FaissVectorStore, SearchResult
|
| 15 |
+
from .analysis import detect_query_type, resolve_count_from_type
|
| 16 |
|
| 17 |
|
| 18 |
try:
|
|
|
|
| 112 |
summary = self.summarize(query, contexts)
|
| 113 |
return RetrievalOutput(query=query, results=results, summary=summary)
|
| 114 |
|
| 115 |
+
def answer(self, query: str, top_k: int = 5) -> RetrievalOutput:
    """Higher-level answer pipeline that handles counting/keyword questions explicitly.

    Count-style queries (thanks, complaints, 'כמה') are resolved over the full
    dataset and summarized; retrieval examples are attached for context.
    Anything else falls back to the semantic `query` pipeline.
    """
    qtype, target = detect_query_type(query)
    if qtype not in ("count_thanks", "count_complaint", "count_keyword"):
        # Freeform question: delegate to semantic QA.
        return self.query(query, top_k=top_k)

    # Counts are computed over the whole corpus for accuracy.
    corpus = load_feedback()
    resolved = resolve_count_from_type(corpus, qtype, target, text_column=settings.text_column)
    count = int(resolved.get("count", 0))
    label = resolved.get("label", "")

    # Answer in Hebrew when the question itself contains Hebrew characters.
    hebrew = any('\u0590' <= ch <= '\u05FF' for ch in query)

    if label == "thanks":
        summary = (f"{count} משובים מכילים ביטויי תודה." if hebrew
                   else f"{count} feedback entries contain thanks.")
    elif label == "complaint_not_working":
        summary = (f"{count} משובים מתארים בעיות/אלמנטים שלא עובדים." if hebrew
                   else f"{count} feedback entries report elements not working.")
    elif label.startswith("keyword:"):
        phrase = label.split("keyword:", 1)[1]
        summary = (f"{count} משובים מכילים את הביטוי '{phrase}'." if hebrew
                   else f"{count} feedback entries contain the phrase '{phrase}'.")
    else:
        summary = (f"{count} משובים נמצאו." if hebrew
                   else f"{count} feedback entries found.")

    # Attach semantic-retrieval hits as supporting examples for the count.
    examples = self.retrieve(query, top_k=top_k)
    return RetrievalOutput(query=query, results=examples, summary=summary)
|
| 151 |
+
|
| 152 |
|
| 153 |
def main() -> None:
|
| 154 |
parser = argparse.ArgumentParser()
|
app/sentiment.py
CHANGED
|
@@ -1,5 +1,13 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from functools import lru_cache
|
| 4 |
from typing import List, Dict
|
| 5 |
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
"""Sentiment analysis helpers using Hugging Face transformers.
|
| 4 |
+
|
| 5 |
+
This module provides a cached sentiment pipeline to analyze lists of texts.
|
| 6 |
+
The model used (`cardiffnlp/twitter-xlm-roberta-base-sentiment`) is multilingual and
|
| 7 |
+
works reasonably well for short feedback messages. The pipeline is cached to avoid
|
| 8 |
+
reloading the model for each call.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
from functools import lru_cache
|
| 12 |
from typing import List, Dict
|
| 13 |
|
app/vector_store.py
CHANGED
|
@@ -1,5 +1,13 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import os
|
| 4 |
from dataclasses import dataclass
|
| 5 |
from typing import List, Tuple, Optional
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
"""A thin wrapper around FAISS index and a Pandas DataFrame for metadata.
|
| 4 |
+
|
| 5 |
+
FaissVectorStore provides methods to add vectors, perform nearest-neighbor search,
|
| 6 |
+
and persist both the FAISS index and the accompanying metadata (as a parquet file).
|
| 7 |
+
|
| 8 |
+
SearchResult holds the matched index, similarity score and the original metadata row.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
import os
|
| 12 |
from dataclasses import dataclass
|
| 13 |
from typing import List, Tuple, Optional
|
run.py
CHANGED
|
@@ -1,3 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import uvicorn # type: ignore
|
|
|
|
| 1 |
+
"""Entrypoint to run the FastAPI application.
|
| 2 |
+
|
| 3 |
+
This module is a tiny wrapper around uvicorn to run the `app.api:app` ASGI
|
| 4 |
+
application on port 8000. Use `python run.py` in development or let the
|
| 5 |
+
container CMD call this file in production.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
import uvicorn # type: ignore
|
scripts/precompute_index.py
CHANGED
|
@@ -1,5 +1,13 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import os
|
| 4 |
from pathlib import Path
|
| 5 |
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
"""Script to precompute the FAISS vector index locally.
|
| 4 |
+
|
| 5 |
+
When deploying to Runpod it's often useful to precompute embeddings and store
|
| 6 |
+
the FAISS index so the server can start quickly without re-embedding the
|
| 7 |
+
entire dataset on first boot. This script writes the index and metadata to
|
| 8 |
+
the configured `VECTOR_INDEX_PATH` and `VECTOR_METADATA_PATH`.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
import os
|
| 12 |
from pathlib import Path
|
| 13 |
|
scripts/test_queries.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Small harness to demonstrate query type detection and quick counts.
|
| 2 |
+
|
| 3 |
+
This script intentionally keeps heavy dependencies optional: it runs the
|
| 4 |
+
lightweight count logic (keyword-based) directly from the CSV. If the FAISS
|
| 5 |
+
index and embedding dependencies are available, it will also show example
|
| 6 |
+
contexts from semantic retrieval.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from app.data_loader import load_feedback
|
| 12 |
+
from app.analysis import detect_query_type, resolve_count_from_type
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def run_examples():
    """Run a few sample Hebrew queries through intent detection and counting."""
    sample_queries = [
        "כמה משתמשים מתלוננים על אלמנטים שלא עובדים להם במערכת",
        "כמה משתמשים כתבו תודה",
        "יש תקלות בשירות ההרשמה",
        "מה הבעיות העיקריות שמשתמשים מציינים?",
    ]

    df = load_feedback()

    for question in sample_queries:
        print("\nQuery:", question)
        qtype, target = detect_query_type(question)
        print("Detected type:", qtype, "target:", target)
        resolved = resolve_count_from_type(df, qtype, target)
        if resolved.get("type") == "count":
            print("Count result:", resolved.get("count"), resolved.get("label"))
            continue
        # Not a count query: fall back to semantic answering, which may need
        # heavy dependencies and a built index. Import lazily and degrade gracefully.
        try:
            from app.rag_service import RAGService

            service = RAGService()
            output = service.answer(question, top_k=3)
            print("Summary:", output.summary)
            for hit in output.results:
                print(f"- [{hit.score:.3f}] {hit.row.get('ServiceName','')} | {hit.row.get('Text','')[:120]}")
        except FileNotFoundError:
            print("Vector index not found. Run /ingest or precompute index to see examples.")
        except Exception as e:
            print("Semantic retrieval unavailable (missing packages or other error):", e)


if __name__ == "__main__":
    run_examples()
|