galbendavids committed on
Commit
1da3dc8
·
1 Parent(s): 1c23b7c

docs: update README, add module docstrings, CONTRIBUTING and VERSION

Browse files
CONTRIBUTING.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing and Usage Guide
2
+
3
+ This project implements a Retrieval-Augmented Generation (RAG) service over citizen feedback.
4
+
5
+ Goals:
6
+ - Make the API easy to run locally and deploy to Runpod or any container platform.
7
+ - Keep sensitive keys out of the repo; use environment variables.
8
+
9
+ Quick workflow:
10
+ 1. Create branch: `git checkout -b feat/improve-intents`
11
+ 2. Make changes and run tests locally.
12
+ 3. Commit and push: `git add . && git commit -m "feat: ..." && git push --set-upstream origin feat/improve-intents`
13
+ 4. Open a Pull Request and request review.
14
+
15
+ Building the image:
16
+ 1. Update `Dockerfile` if you need to pre-bake models.
17
+ 2. Build and tag:
18
+ ```bash
19
+ docker build -t youruser/feedback-rag:v1 .
20
+ docker push youruser/feedback-rag:v1
21
+ ```
22
+
23
+ Run on Runpod:
24
+ - See `README.md` section "Run on Runpod - Full guide" for step-by-step.
25
+
26
+ Tests:
27
+ - No unit tests included yet. Prefer adding `pytest` tests for `app/analysis.py` and the API layer.
28
+
29
+ Contact:
30
+ - For major changes, create an issue first describing the design and performance considerations.
README.md CHANGED
@@ -59,6 +59,8 @@ Environment variables:
59
  - OPENAI_API_KEY: If set, RAG can use OpenAI as a fallback
60
  - EMBEDDING_MODEL: Sentence-Transformers model name (default: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)
61
  - VECTOR_INDEX_PATH: Path to persist FAISS index (default: ./.vector_index/faiss.index)
 
 
62
 
63
  ### Notes
64
  - The first run will download models (embeddings, sentiment); ensure internet access.
@@ -136,13 +138,17 @@ curl -X POST {YOUR_ENDPOINT_URL}/query \
136
  -d '{"query":"שיפור טופס", "top_k": 5}' \
137
  {YOUR_ENDPOINT_URL}/query
138
  ```
139
- - Topics:
140
  ```
141
- curl -s "{YOUR_ENDPOINT_URL}/topics?num_topics=8"
 
 
142
  ```
143
- - Sentiment (first N rows):
144
  ```
145
- curl -s "{YOUR_ENDPOINT_URL}/sentiment?limit=100"
 
 
146
  ```
147
  - Interactive docs (Swagger UI):
148
  - Open `{YOUR_ENDPOINT_URL}/docs` in your browser
 
59
  - OPENAI_API_KEY: If set, RAG can use OpenAI as a fallback
60
  - EMBEDDING_MODEL: Sentence-Transformers model name (default: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)
61
  - VECTOR_INDEX_PATH: Path to persist FAISS index (default: ./.vector_index/faiss.index)
62
+ - VECTOR_METADATA_PATH: Path to persist FAISS index metadata (default: ./.vector_index/meta.parquet)
63
+ - CSV_PATH: Optional path to your CSV (if not `Feedback.csv` in repo root)
64
 
65
  ### Notes
66
  - The first run will download models (embeddings, sentiment); ensure internet access.
 
138
  -d '{"query":"שיפור טופס", "top_k": 5}' \
139
  {YOUR_ENDPOINT_URL}/query
140
  ```
141
+ - Topics (POST JSON):
142
  ```
143
+ curl -X POST {YOUR_ENDPOINT_URL}/topics \
144
+ -H "Content-Type: application/json" \
145
+ -d '{"num_topics":8}'
146
  ```
147
+ - Sentiment (first N rows, POST JSON):
148
  ```
149
+ curl -X POST {YOUR_ENDPOINT_URL}/sentiment \
150
+ -H "Content-Type: application/json" \
151
+ -d '{"limit":100}'
152
  ```
153
  - Interactive docs (Swagger UI):
154
  - Open `{YOUR_ENDPOINT_URL}/docs` in your browser
VERSION ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.1.0
app/__init__.py CHANGED
@@ -1,2 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
1
  # Makes `app` a package so imports like `from app.rag_service import RAGService` work.
2
 
 
1
+ """Application package for the Feedback Analysis RAG Agent.
2
+
3
+ This package contains the core modules that implement ingestion, embedding,
4
+ vector storage and the FastAPI endpoints used by the service.
5
+
6
+ Import example:
7
+ from app.rag_service import RAGService
8
+
9
+ Keep this file minimal — module-level documentation only.
10
+ """
11
+
12
  # Makes `app` a package so imports like `from app.rag_service import RAGService` work.
13
 
app/analysis.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ """Utilities to detect simple question intents and resolve counts over the feedback corpus.
4
+
5
+ This module implements lightweight, rule-based detection for queries such as:
6
+ - "כמה משתמשים כתבו תודה" -> count thank-you messages
7
+ - "כמה מתלוננים על אלמנטים שלא עובדים" -> count complaint-like messages
8
+
9
+ The approach is intentionally simple (keyword matching) to avoid heavy dependencies and
10
+ to provide fast, explainable counts. It returns structured dicts that higher-level code
11
+ can convert to human-readable summaries or JSON responses.
12
+ """
13
+
14
+ import re
15
+ from typing import Iterable, List, Optional, Tuple
16
+
17
+ import pandas as pd
18
+
19
+ from .preprocess import preprocess_text
20
+ from .config import settings
21
+
22
+
23
+ COMPLAINT_KEYWORDS = [
24
+ "לא עובד",
25
+ "לא עובדים",
26
+ "שגיאה",
27
+ "תקלה",
28
+ "לא פועל",
29
+ "נכשל",
30
+ "לא מצליח",
31
+ "לא ניתן",
32
+ "המערכת לא",
33
+ "לא תקין",
34
+ "לא עובדים להם",
35
+ ]
36
+
37
+ THANKS_KEYWORDS = ["תודה", "תודה רבה", "תודה!", "תודה רבה!", "תודה רבה מאוד"]
38
+
39
+
40
+ def _contains_any(text: str, keywords: Iterable[str]) -> bool:
41
+ t = preprocess_text(text).lower()
42
+ for kw in keywords:
43
+ if kw in t:
44
+ return True
45
+ return False
46
+
47
+
48
+ def count_keyword_rows(df: pd.DataFrame, keywords: Iterable[str], text_column: str = "Text") -> int:
49
+ if df is None or df.empty:
50
+ return 0
51
+ kws = [str(k).lower() for k in keywords]
52
+ def row_match(s: str) -> bool:
53
+ s = preprocess_text(str(s)).lower()
54
+ return any(kw in s for kw in kws)
55
+ return int(df[text_column].astype(str).apply(row_match).sum())
56
+
57
+
58
+ def detect_query_type(query: str) -> Tuple[str, Optional[str]]:
59
+ """Return (type, target) where type is one of: 'count_thanks', 'count_complaint', 'count_keyword', 'freeform'.
60
+
61
+ target may contain a detected keyword or phrase when relevant.
62
+ """
63
+ q = preprocess_text(query).lower()
64
+ # Simple Hebrew heuristics
65
+ if "תודה" in q or "מודה" in q:
66
+ return ("count_thanks", None)
67
+ if any(k in q for k in ["לא עובד", "לא עובדים", "תקלה", "שגיאה", "לא פועל", "נכשל"]):
68
+ return ("count_complaint", None)
69
+ # Generic "כמה" count with a keyword after 'על' or 'ל' or 'בש"'
70
+ if q.strip().startswith("כמה") or "כמה משתמשים" in q:
71
+ # try extract noun after 'על' or 'ש' or 'עם'
72
+ m = re.search(r"על\s+([^\n\?]+)", q)
73
+ if m:
74
+ return ("count_keyword", m.group(1).strip())
75
+ m2 = re.search(r"כמה\s+[^\s]+\s+([^\n\?]+)", q)
76
+ if m2:
77
+ return ("count_keyword", m2.group(1).strip())
78
+ return ("count_keyword", None)
79
+ return ("freeform", None)
80
+
81
+
82
+ def resolve_count_from_type(df: pd.DataFrame, qtype: str, target: Optional[str], text_column: str = "Text"):
83
+ if qtype == "count_thanks":
84
+ cnt = count_keyword_rows(df, THANKS_KEYWORDS, text_column=text_column)
85
+ return {"type": "count", "label": "thanks", "count": cnt}
86
+ if qtype == "count_complaint":
87
+ cnt = count_keyword_rows(df, COMPLAINT_KEYWORDS, text_column=text_column)
88
+ return {"type": "count", "label": "complaint_not_working", "count": cnt}
89
+ if qtype == "count_keyword":
90
+ if target:
91
+ # count rows that contain the exact target phrase
92
+ pattern = re.escape(target.lower())
93
+ cnt = int(df[text_column].astype(str).str.lower().str.contains(pattern, regex=True).sum())
94
+ return {"type": "count", "label": f"keyword:{target}", "count": int(cnt)}
95
+ # fallback: return total rows
96
+ return {"type": "count", "label": "all", "count": int(len(df))}
97
+ return {"type": "unknown"}
app/api.py CHANGED
@@ -53,7 +53,8 @@ def ingest() -> Dict[str, Any]:
53
  def query(req: QueryRequest) -> QueryResponse:
54
  """Free-form question answering over feedback data."""
55
  try:
56
- out = svc.query(req.query, top_k=req.top_k)
 
57
  return QueryResponse(
58
  query=out.query,
59
  summary=out.summary,
@@ -81,9 +82,17 @@ def query(req: QueryRequest) -> QueryResponse:
81
  )
82
 
83
 
84
- @app.get("/topics")
85
- def topics(num_topics: int = Query(5, ge=2, le=50)) -> Dict[str, Any]:
86
- """Extract main topics from feedback. Returns topics with summaries."""
 
 
 
 
 
 
 
 
87
  try:
88
  # Load embeddings from store
89
  store = FaissVectorStore.load(settings.vector_index_path, settings.vector_metadata_path)
@@ -168,8 +177,17 @@ def topics(num_topics: int = Query(5, ge=2, le=50)) -> Dict[str, Any]:
168
  return {"error": str(e), "num_topics": 0, "topics": {}}
169
 
170
 
171
- @app.get("/sentiment")
172
- def sentiment(limit: int = Query(100, ge=1, le=2000)) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
173
  df = load_feedback().head(limit)
174
  texts = df[settings.text_column].astype(str).tolist()
175
  out = analyze_sentiments(texts)
 
53
  def query(req: QueryRequest) -> QueryResponse:
54
  """Free-form question answering over feedback data."""
55
  try:
56
+ # Use the higher-level answer pipeline which can handle counts and keyword queries
57
+ out = svc.answer(req.query, top_k=req.top_k)
58
  return QueryResponse(
59
  query=out.query,
60
  summary=out.summary,
 
82
  )
83
 
84
 
85
+ class TopicsRequest(BaseModel):
86
+ num_topics: int = 5
87
+
88
+
89
+ @app.post("/topics")
90
+ def topics(req: TopicsRequest) -> Dict[str, Any]:
91
+ """Extract main topics from feedback. Accepts POST body: {"num_topics": int}.
92
+
93
+ Using POST allows larger and structured request bodies (and avoids URL length limits).
94
+ """
95
+ num_topics = req.num_topics
96
  try:
97
  # Load embeddings from store
98
  store = FaissVectorStore.load(settings.vector_index_path, settings.vector_metadata_path)
 
177
  return {"error": str(e), "num_topics": 0, "topics": {}}
178
 
179
 
180
+ class SentimentRequest(BaseModel):
181
+ limit: int = 100
182
+
183
+
184
+ @app.post("/sentiment")
185
+ def sentiment(req: SentimentRequest) -> Dict[str, Any]:
186
+ """Analyze sentiment for the first `limit` feedback entries. Accepts POST body: {"limit": 100}.
187
+
188
+ Using POST keeps the API consistent for clients that prefer JSON bodies over URL query params.
189
+ """
190
+ limit = req.limit
191
  df = load_feedback().head(limit)
192
  texts = df[settings.text_column].astype(str).tolist()
193
  out = analyze_sentiments(texts)
app/data_loader.py CHANGED
@@ -1,5 +1,12 @@
1
  from __future__ import annotations
2
 
 
 
 
 
 
 
 
3
  import pandas as pd
4
  from .config import settings
5
 
 
1
  from __future__ import annotations
2
 
3
+ """Load feedback data from CSV and normalize expected columns.
4
+
5
+ The system expects a CSV with at least the columns: ID, ServiceName, Level, Text.
6
+ `load_feedback` validates the presence of these columns, drops empty text rows,
7
+ and returns a cleaned Pandas DataFrame.
8
+ """
9
+
10
  import pandas as pd
11
  from .config import settings
12
 
app/embedding.py CHANGED
@@ -1,5 +1,12 @@
1
  from __future__ import annotations
2
 
 
 
 
 
 
 
 
3
  from typing import Iterable, List
4
 
5
  import numpy as np
 
1
  from __future__ import annotations
2
 
3
+ """EmbeddingModel wrapper around sentence-transformers.
4
+
5
+ This class lazily loads a SentenceTransformer model (configured via
6
+ `settings.embedding_model_name`) and exposes `encode` and `encode_single`.
7
+ Normalizes embeddings to unit length for cosine-similarity search in FAISS.
8
+ """
9
+
10
  from typing import Iterable, List
11
 
12
  import numpy as np
app/preprocess.py CHANGED
@@ -1,14 +1,25 @@
1
  from __future__ import annotations
2
 
3
- from langdetect import detect, DetectorFactory # type: ignore
4
-
5
- DetectorFactory.seed = 42
6
-
7
-
8
- def detect_language(text: str) -> str:
9
- try:
10
- return detect(text)
11
- except Exception:
 
 
 
 
 
 
 
 
 
 
 
12
  return "unknown"
13
 
14
 
 
1
  from __future__ import annotations
2
 
3
+ """Text preprocessing helpers.
4
+
5
+ Includes minimal normalization and an optional language detection helper. The
6
+ `langdetect` dependency is optional — when it's not installed, `detect_language`
7
+ returns "unknown". This keeps lightweight workflows (like simple counting) runnable
8
+ without installing all NLP dependencies.
9
+ """
10
+
11
+ try:
12
+ from langdetect import detect, DetectorFactory # type: ignore
13
+ DetectorFactory.seed = 42
14
+
15
+ def detect_language(text: str) -> str:
16
+ try:
17
+ return detect(text)
18
+ except Exception:
19
+ return "unknown"
20
+ except Exception:
21
+ # langdetect is optional for lightweight usage; provide fallback
22
+ def detect_language(text: str) -> str:
23
  return "unknown"
24
 
25
 
app/rag_service.py CHANGED
@@ -12,6 +12,7 @@ from .data_loader import load_feedback
12
  from .embedding import EmbeddingModel
13
  from .preprocess import preprocess_text
14
  from .vector_store import FaissVectorStore, SearchResult
 
15
 
16
 
17
  try:
@@ -111,6 +112,43 @@ class RAGService:
111
  summary = self.summarize(query, contexts)
112
  return RetrievalOutput(query=query, results=results, summary=summary)
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
  def main() -> None:
116
  parser = argparse.ArgumentParser()
 
12
  from .embedding import EmbeddingModel
13
  from .preprocess import preprocess_text
14
  from .vector_store import FaissVectorStore, SearchResult
15
+ from .analysis import detect_query_type, resolve_count_from_type
16
 
17
 
18
  try:
 
112
  summary = self.summarize(query, contexts)
113
  return RetrievalOutput(query=query, results=results, summary=summary)
114
 
115
+ def answer(self, query: str, top_k: int = 5) -> RetrievalOutput:
116
+ """Higher-level answer pipeline that handles counting/keyword questions explicitly.
117
+
118
+ For queries detected as counts (e.g., thanks, complaints, 'כמה'), compute counts over
119
+ the full dataset and return a short summary plus example contexts from retrieval.
120
+ Falls back to `query` for freeform QA.
121
+ """
122
+ qtype, target = detect_query_type(query)
123
+ if qtype in ("count_thanks", "count_complaint", "count_keyword"):
124
+ # Use full dataset for accurate counts
125
+ df = load_feedback()
126
+ resolved = resolve_count_from_type(df, qtype, target, text_column=settings.text_column)
127
+ count = int(resolved.get("count", 0))
128
+ # Friendly, language-aware summary
129
+ is_hebrew = any('\u0590' <= ch <= '\u05FF' for ch in query)
130
+ if resolved.get("label") == "thanks":
131
+ summary = (f"{count} משובים מכילים ביטויי תודה." if is_hebrew
132
+ else f"{count} feedback entries contain thanks.")
133
+ elif resolved.get("label") == "complaint_not_working":
134
+ summary = (f"{count} משובים מתארים בעיות/אלמנטים שלא עובדים." if is_hebrew
135
+ else f"{count} feedback entries report elements not working.")
136
+ else:
137
+ label = resolved.get("label", "")
138
+ if label.startswith("keyword:"):
139
+ phrase = label.split("keyword:", 1)[1]
140
+ summary = (f"{count} משובים מכילים את הביטוי '{phrase}'." if is_hebrew
141
+ else f"{count} feedback entries contain the phrase '{phrase}'.")
142
+ else:
143
+ summary = (f"{count} משובים נמצאו." if is_hebrew else f"{count} feedback entries found.")
144
+
145
+ # Provide examples from semantic retrieval for context
146
+ results = self.retrieve(query, top_k=top_k)
147
+ return RetrievalOutput(query=query, results=results, summary=summary)
148
+
149
+ # Fallback to semantic QA
150
+ return self.query(query, top_k=top_k)
151
+
152
 
153
  def main() -> None:
154
  parser = argparse.ArgumentParser()
app/sentiment.py CHANGED
@@ -1,5 +1,13 @@
1
  from __future__ import annotations
2
 
 
 
 
 
 
 
 
 
3
  from functools import lru_cache
4
  from typing import List, Dict
5
 
 
1
  from __future__ import annotations
2
 
3
+ """Sentiment analysis helpers using Hugging Face transformers.
4
+
5
+ This module provides a cached sentiment pipeline to analyze lists of texts.
6
+ The model used (`cardiffnlp/twitter-xlm-roberta-base-sentiment`) is multilingual and
7
+ works reasonably well for short feedback messages. The pipeline is cached to avoid
8
+ reloading the model for each call.
9
+ """
10
+
11
  from functools import lru_cache
12
  from typing import List, Dict
13
 
app/vector_store.py CHANGED
@@ -1,5 +1,13 @@
1
  from __future__ import annotations
2
 
 
 
 
 
 
 
 
 
3
  import os
4
  from dataclasses import dataclass
5
  from typing import List, Tuple, Optional
 
1
  from __future__ import annotations
2
 
3
+ """A thin wrapper around FAISS index and a Pandas DataFrame for metadata.
4
+
5
+ FaissVectorStore provides methods to add vectors, perform nearest-neighbor search,
6
+ and persist both the FAISS index and the accompanying metadata (as a parquet file).
7
+
8
+ SearchResult holds the matched index, similarity score and the original metadata row.
9
+ """
10
+
11
  import os
12
  from dataclasses import dataclass
13
  from typing import List, Tuple, Optional
run.py CHANGED
@@ -1,3 +1,10 @@
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  import uvicorn # type: ignore
 
1
+ """Entrypoint to run the FastAPI application.
2
+
3
+ This module is a tiny wrapper around uvicorn to run the `app.api:app` ASGI
4
+ application on port 8000. Use `python run.py` in development or let the
5
+ container CMD call this file in production.
6
+ """
7
+
8
  from __future__ import annotations
9
 
10
  import uvicorn # type: ignore
scripts/precompute_index.py CHANGED
@@ -1,5 +1,13 @@
1
  from __future__ import annotations
2
 
 
 
 
 
 
 
 
 
3
  import os
4
  from pathlib import Path
5
 
 
1
  from __future__ import annotations
2
 
3
+ """Script to precompute the FAISS vector index locally.
4
+
5
+ When deploying to Runpod it's often useful to precompute embeddings and store
6
+ the FAISS index so the server can start quickly without re-embedding the
7
+ entire dataset on first boot. This script writes the index and metadata to
8
+ the configured `VECTOR_INDEX_PATH` and `VECTOR_METADATA_PATH`.
9
+ """
10
+
11
  import os
12
  from pathlib import Path
13
 
scripts/test_queries.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Small harness to demonstrate query type detection and quick counts.
2
+
3
+ This script intentionally keeps heavy dependencies optional: it runs the
4
+ lightweight count logic (keyword-based) directly from the CSV. If the FAISS
5
+ index and embedding dependencies are available, it will also show example
6
+ contexts from semantic retrieval.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from app.data_loader import load_feedback
12
+ from app.analysis import detect_query_type, resolve_count_from_type
13
+
14
+
15
+ def run_examples():
16
+ examples = [
17
+ "כמה משתמשים מתלוננים על אלמנטים שלא עובדים להם במערכת",
18
+ "כמה משתמשים כתבו תודה",
19
+ "יש תקלות בשירות ההרשמה",
20
+ "מה הבעיות העיקריות שמשתמשים מציינים?",
21
+ ]
22
+
23
+ df = load_feedback()
24
+
25
+ for q in examples:
26
+ print("\nQuery:", q)
27
+ qtype, target = detect_query_type(q)
28
+ print("Detected type:", qtype, "target:", target)
29
+ resolved = resolve_count_from_type(df, qtype, target)
30
+ if resolved.get("type") == "count":
31
+ print("Count result:", resolved.get("count"), resolved.get("label"))
32
+ else:
33
+ # Fallback to semantic answer (may require heavy deps and a built index). Try to import and run if available.
34
+ try:
35
+ from app.rag_service import RAGService
36
+ svc = RAGService()
37
+ out = svc.answer(q, top_k=3)
38
+ print("Summary:", out.summary)
39
+ for r in out.results:
40
+ print(f"- [{r.score:.3f}] {r.row.get('ServiceName','')} | {r.row.get('Text','')[:120]}")
41
+ except FileNotFoundError:
42
+ print("Vector index not found. Run /ingest or precompute index to see examples.")
43
+ except Exception as e:
44
+ print("Semantic retrieval unavailable (missing packages or other error):", e)
45
+
46
+
47
+ if __name__ == "__main__":
48
+ run_examples()