galbendavids committed on
Commit
1da3dc8
·
1 Parent(s): 1c23b7c

docs: update README, add module docstrings, CONTRIBUTING and VERSION

Browse files
CONTRIBUTING.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing and Usage Guide
2
+
3
+ This project implements a Retrieval-Augmented Generation (RAG) service over citizen feedback.
4
+
5
+ Goals:
6
+ - Make the API easy to run locally and deploy to Runpod or any container platform.
7
+ - Keep sensitive keys out of the repo; use environment variables.
8
+
9
+ Quick workflow:
10
+ 1. Create branch: `git checkout -b feat/improve-intents`
11
+ 2. Make changes and run tests locally.
12
+ 3. Commit and push: `git add . && git commit -m "feat: ..." && git push --set-upstream origin feat/improve-intents`
13
+ 4. Open a Pull Request and request review.
14
+
15
+ Building the image:
16
+ 1. Update `Dockerfile` if you need to pre-bake models.
17
+ 2. Build and tag:
18
+ ```bash
19
+ docker build -t youruser/feedback-rag:v1 .
20
+ docker push youruser/feedback-rag:v1
21
+ ```
22
+
23
+ Run on Runpod:
24
+ - See `README.md` section "Run on Runpod - Full guide" for step-by-step.
25
+
26
+ Tests:
27
+ - No unit tests included yet. Prefer adding `pytest` tests for `app/analysis.py` and the API layer.
28
+
29
+ Contact:
30
+ - For major changes, create an issue first describing the design and performance considerations.
README.md CHANGED
@@ -59,6 +59,8 @@ Environment variables:
59
  - OPENAI_API_KEY: If set, RAG can use OpenAI as a fallback
60
  - EMBEDDING_MODEL: Sentence-Transformers model name (default: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)
61
  - VECTOR_INDEX_PATH: Path to persist FAISS index (default: ./.vector_index/faiss.index)
 
 
62
 
63
  ### Notes
64
  - The first run will download models (embeddings, sentiment); ensure internet access.
@@ -136,13 +138,17 @@ curl -X POST {YOUR_ENDPOINT_URL}/query \
136
  -d '{"query":"שיפור טופס", "top_k": 5}' \
137
  {YOUR_ENDPOINT_URL}/query
138
  ```
139
- - Topics:
140
  ```
141
- curl -s "{YOUR_ENDPOINT_URL}/topics?num_topics=8"
 
 
142
  ```
143
- - Sentiment (first N rows):
144
  ```
145
- curl -s "{YOUR_ENDPOINT_URL}/sentiment?limit=100"
 
 
146
  ```
147
  - Interactive docs (Swagger UI):
148
  - Open `{YOUR_ENDPOINT_URL}/docs` in your browser
 
59
  - OPENAI_API_KEY: If set, RAG can use OpenAI as a fallback
60
  - EMBEDDING_MODEL: Sentence-Transformers model name (default: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)
61
  - VECTOR_INDEX_PATH: Path to persist FAISS index (default: ./.vector_index/faiss.index)
62
+ - VECTOR_METADATA_PATH: Path to persist FAISS index metadata (default: ./.vector_index/meta.parquet)
63
+ - CSV_PATH: Optional path to your CSV (if not `Feedback.csv` in repo root)
64
 
65
  ### Notes
66
  - The first run will download models (embeddings, sentiment); ensure internet access.
 
138
  -d '{"query":"שיפור טופס", "top_k": 5}' \
139
  {YOUR_ENDPOINT_URL}/query
140
  ```
141
+ - Topics (POST JSON):
142
  ```
143
+ curl -X POST {YOUR_ENDPOINT_URL}/topics \
144
+ -H "Content-Type: application/json" \
145
+ -d '{"num_topics":8}'
146
  ```
147
+ - Sentiment (first N rows, POST JSON):
148
  ```
149
+ curl -X POST {YOUR_ENDPOINT_URL}/sentiment \
150
+ -H "Content-Type: application/json" \
151
+ -d '{"limit":100}'
152
  ```
153
  - Interactive docs (Swagger UI):
154
  - Open `{YOUR_ENDPOINT_URL}/docs` in your browser
VERSION ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.1.0
app/__init__.py CHANGED
@@ -1,2 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
1
  # Makes `app` a package so imports like `from app.rag_service import RAGService` work.
2
 
 
1
+ """Application package for the Feedback Analysis RAG Agent.
2
+
3
+ This package contains the core modules that implement ingestion, embedding,
4
+ vector storage and the FastAPI endpoints used by the service.
5
+
6
+ Import example:
7
+ from app.rag_service import RAGService
8
+
9
+ Keep this file minimal — module-level documentation only.
10
+ """
11
+
12
  # Makes `app` a package so imports like `from app.rag_service import RAGService` work.
13
 
app/analysis.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ """Utilities to detect simple question intents and resolve counts over the feedback corpus.
4
+
5
+ This module implements lightweight, rule-based detection for queries such as:
6
+ - "כמה משתמשים כתבו תודה" -> count thank-you messages
7
+ - "כמה מתלוננים על אלמנטים שלא עובדים" -> count complaint-like messages
8
+
9
+ The approach is intentionally simple (keyword matching) to avoid heavy dependencies and
10
+ to provide fast, explainable counts. It returns structured dicts that higher-level code
11
+ can convert to human-readable summaries or JSON responses.
12
+ """
13
+
14
+ import re
15
+ from typing import Iterable, List, Optional, Tuple
16
+
17
+ import pandas as pd
18
+
19
+ from .preprocess import preprocess_text
20
+ from .config import settings
21
+
22
+
23
+ COMPLAINT_KEYWORDS = [
24
+ "לא עובד",
25
+ "לא עובדים",
26
+ "שגיאה",
27
+ "תקלה",
28
+ "לא פועל",
29
+ "נכשל",
30
+ "לא מצליח",
31
+ "לא ניתן",
32
+ "המערכת לא",
33
+ "לא תקין",
34
+ "לא עובדים להם",
35
+ ]
36
+
37
+ THANKS_KEYWORDS = ["תודה", "תודה רבה", "תודה!", "תודה רבה!", "תודה רבה מאוד"]
38
+
39
+
40
+ def _contains_any(text: str, keywords: Iterable[str]) -> bool:
41
+ t = preprocess_text(text).lower()
42
+ for kw in keywords:
43
+ if kw in t:
44
+ return True
45
+ return False
46
+
47
+
48
+ def count_keyword_rows(df: pd.DataFrame, keywords: Iterable[str], text_column: str = "Text") -> int:
49
+ if df is None or df.empty:
50
+ return 0
51
+ kws = [str(k).lower() for k in keywords]
52
+ def row_match(s: str) -> bool:
53
+ s = preprocess_text(str(s)).lower()
54
+ return any(kw in s for kw in kws)
55
+ return int(df[text_column].astype(str).apply(row_match).sum())
56
+
57
+
58
+ def detect_query_type(query: str) -> Tuple[str, Optional[str]]:
59
+ """Return (type, target) where type is one of: 'count_thanks', 'count_complaint', 'count_keyword', 'freeform'.
60
+
61
+ target may contain a detected keyword or phrase when relevant.
62
+ """
63
+ q = preprocess_text(query).lower()
64
+ # Simple Hebrew heuristics
65
+ if "תודה" in q or "מודה" in q:
66
+ return ("count_thanks", None)
67
+ if any(k in q for k in ["לא עובד", "לא עובדים", "תקלה", "שגיאה", "לא פועל", "נכשל"]):
68
+ return ("count_complaint", None)
69
+ # Generic "כמה" count with a keyword after 'על' or 'ל' or 'בש"'
70
+ if q.strip().startswith("כמה") or "כמה משתמשים" in q:
71
+ # try extract noun after 'על' or 'ש' or 'עם'
72
+ m = re.search(r"על\s+([^\n\?]+)", q)
73
+ if m:
74
+ return ("count_keyword", m.group(1).strip())
75
+ m2 = re.search(r"כמה\s+[^\s]+\s+([^\n\?]+)", q)
76
+ if m2:
77
+ return ("count_keyword", m2.group(1).strip())
78
+ return ("count_keyword", None)
79
+ return ("freeform", None)
80
+
81
+
82
+ def resolve_count_from_type(df: pd.DataFrame, qtype: str, target: Optional[str], text_column: str = "Text"):
83
+ if qtype == "count_thanks":
84
+ cnt = count_keyword_rows(df, THANKS_KEYWORDS, text_column=text_column)
85
+ return {"type": "count", "label": "thanks", "count": cnt}
86
+ if qtype == "count_complaint":
87
+ cnt = count_keyword_rows(df, COMPLAINT_KEYWORDS, text_column=text_column)
88
+ return {"type": "count", "label": "complaint_not_working", "count": cnt}
89
+ if qtype == "count_keyword":
90
+ if target:
91
+ # count rows that contain the exact target phrase
92
+ pattern = re.escape(target.lower())
93
+ cnt = int(df[text_column].astype(str).str.lower().str.contains(pattern, regex=True).sum())
94
+ return {"type": "count", "label": f"keyword:{target}", "count": int(cnt)}
95
+ # fallback: return total rows
96
+ return {"type": "count", "label": "all", "count": int(len(df))}
97
+ return {"type": "unknown"}
app/api.py CHANGED
@@ -53,7 +53,8 @@ def ingest() -> Dict[str, Any]:
53
  def query(req: QueryRequest) -> QueryResponse:
54
  """Free-form question answering over feedback data."""
55
  try:
56
- out = svc.query(req.query, top_k=req.top_k)
 
57
  return QueryResponse(
58
  query=out.query,
59
  summary=out.summary,
@@ -81,9 +82,17 @@ def query(req: QueryRequest) -> QueryResponse:
81
  )
82
 
83
 
84
- @app.get("/topics")
85
- def topics(num_topics: int = Query(5, ge=2, le=50)) -> Dict[str, Any]:
86
- """Extract main topics from feedback. Returns topics with summaries."""
 
 
 
 
 
 
 
 
87
  try:
88
  # Load embeddings from store
89
  store = FaissVectorStore.load(settings.vector_index_path, settings.vector_metadata_path)
@@ -168,8 +177,17 @@ def topics(num_topics: int = Query(5, ge=2, le=50)) -> Dict[str, Any]:
168
  return {"error": str(e), "num_topics": 0, "topics": {}}
169
 
170
 
171
- @app.get("/sentiment")
172
- def sentiment(limit: int = Query(100, ge=1, le=2000)) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
173
  df = load_feedback().head(limit)
174
  texts = df[settings.text_column].astype(str).tolist()
175
  out = analyze_sentiments(texts)
 
53
  def query(req: QueryRequest) -> QueryResponse:
54
  """Free-form question answering over feedback data."""
55
  try:
56
+ # Use the higher-level answer pipeline which can handle counts and keyword queries
57
+ out = svc.answer(req.query, top_k=req.top_k)
58
  return QueryResponse(
59
  query=out.query,
60
  summary=out.summary,
 
82
  )
83
 
84
 
85
+ class TopicsRequest(BaseModel):
86
+ num_topics: int = 5
87
+
88
+
89
+ @app.post("/topics")
90
+ def topics(req: TopicsRequest) -> Dict[str, Any]:
91
+ """Extract main topics from feedback. Accepts POST body: {"num_topics": int}.
92
+
93
+ Using POST allows larger and structured request bodies (and avoids URL length limits).
94
+ """
95
+ num_topics = req.num_topics
96
  try:
97
  # Load embeddings from store
98
  store = FaissVectorStore.load(settings.vector_index_path, settings.vector_metadata_path)
 
177
  return {"error": str(e), "num_topics": 0, "topics": {}}
178
 
179
 
180
+ class SentimentRequest(BaseModel):
181
+ limit: int = 100
182
+
183
+
184
+ @app.post("/sentiment")
185
+ def sentiment(req: SentimentRequest) -> Dict[str, Any]:
186
+ """Analyze sentiment for the first `limit` feedback entries. Accepts POST body: {"limit": 100}.
187
+
188
+ Using POST keeps the API consistent for clients that prefer JSON bodies over URL query params.
189
+ """
190
+ limit = req.limit
191
  df = load_feedback().head(limit)
192
  texts = df[settings.text_column].astype(str).tolist()
193
  out = analyze_sentiments(texts)
app/data_loader.py CHANGED
@@ -1,5 +1,12 @@
1
  from __future__ import annotations
2
 
 
 
 
 
 
 
 
3
  import pandas as pd
4
  from .config import settings
5
 
 
1
  from __future__ import annotations
2
 
3
+ """Load feedback data from CSV and normalize expected columns.
4
+
5
+ The system expects a CSV with at least the columns: ID, ServiceName, Level, Text.
6
+ `load_feedback` validates the presence of these columns, drops empty text rows,
7
+ and returns a cleaned Pandas DataFrame.
8
+ """
9
+
10
  import pandas as pd
11
  from .config import settings
12
 
app/embedding.py CHANGED
@@ -1,5 +1,12 @@
1
  from __future__ import annotations
2
 
 
 
 
 
 
 
 
3
  from typing import Iterable, List
4
 
5
  import numpy as np
 
1
  from __future__ import annotations
2
 
3
+ """EmbeddingModel wrapper around sentence-transformers.
4
+
5
+ This class lazily loads a SentenceTransformer model (configured via
6
+ `settings.embedding_model_name`) and exposes `encode` and `encode_single`.
7
+ Normalizes embeddings to unit length for cosine-similarity search in FAISS.
8
+ """
9
+
10
  from typing import Iterable, List
11
 
12
  import numpy as np
app/preprocess.py CHANGED
@@ -1,14 +1,25 @@
1
  from __future__ import annotations
2
 
3
- from langdetect import detect, DetectorFactory # type: ignore
4
-
5
- DetectorFactory.seed = 42
6
-
7
-
8
- def detect_language(text: str) -> str:
9
- try:
10
- return detect(text)
11
- except Exception:
 
 
 
 
 
 
 
 
 
 
 
12
  return "unknown"
13
 
14
 
 
1
  from __future__ import annotations
2
 
3
+ """Text preprocessing helpers.
4
+
5
+ Includes minimal normalization and an optional language detection helper. The
6
+ `langdetect` dependency is optional — when it's not installed, `detect_language`
7
+ returns "unknown". This keeps lightweight workflows (like simple counting) runnable
8
+ without installing all NLP dependencies.
9
+ """
10
+
11
+ try:
12
+ from langdetect import detect, DetectorFactory # type: ignore
13
+ DetectorFactory.seed = 42
14
+
15
+ def detect_language(text: str) -> str:
16
+ try:
17
+ return detect(text)
18
+ except Exception:
19
+ return "unknown"
20
+ except Exception:
21
+ # langdetect is optional for lightweight usage; provide fallback
22
+ def detect_language(text: str) -> str:
23
  return "unknown"
24
 
25
 
app/rag_service.py CHANGED
@@ -12,6 +12,7 @@ from .data_loader import load_feedback
12
  from .embedding import EmbeddingModel
13
  from .preprocess import preprocess_text
14
  from .vector_store import FaissVectorStore, SearchResult
 
15
 
16
 
17
  try:
@@ -111,6 +112,43 @@ class RAGService:
111
  summary = self.summarize(query, contexts)
112
  return RetrievalOutput(query=query, results=results, summary=summary)
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
  def main() -> None:
116
  parser = argparse.ArgumentParser()
 
12
  from .embedding import EmbeddingModel
13
  from .preprocess import preprocess_text
14
  from .vector_store import FaissVectorStore, SearchResult
15
+ from .analysis import detect_query_type, resolve_count_from_type
16
 
17
 
18
  try:
 
112
  summary = self.summarize(query, contexts)
113
  return RetrievalOutput(query=query, results=results, summary=summary)
114
 
115
+ def answer(self, query: str, top_k: int = 5) -> RetrievalOutput:
116
+ """Higher-level answer pipeline that handles counting/keyword questions explicitly.
117
+
118
+ For queries detected as counts (e.g., thanks, complaints, 'כמה'), compute counts over
119
+ the full dataset and return a short summary plus example contexts from retrieval.
120
+ Falls back to `query` for freeform QA.
121
+ """
122
+ qtype, target = detect_query_type(query)
123
+ if qtype in ("count_thanks", "count_complaint", "count_keyword"):
124
+ # Use full dataset for accurate counts
125
+ df = load_feedback()
126
+ resolved = resolve_count_from_type(df, qtype, target, text_column=settings.text_column)
127
+ count = int(resolved.get("count", 0))
128
+ # Friendly, language-aware summary
129
+ is_hebrew = any('\u0590' <= ch <= '\u05FF' for ch in query)
130
+ if resolved.get("label") == "thanks":
131
+ summary = (f"{count} משובים מכילים ביטויי תודה." if is_hebrew
132
+ else f"{count} feedback entries contain thanks.")
133
+ elif resolved.get("label") == "complaint_not_working":
134
+ summary = (f"{count} משובים מתארים בעיות/אלמנטים שלא עובדים." if is_hebrew
135
+ else f"{count} feedback entries report elements not working.")
136
+ else:
137
+ label = resolved.get("label", "")
138
+ if label.startswith("keyword:"):
139
+ phrase = label.split("keyword:", 1)[1]
140
+ summary = (f"{count} משובים מכילים את הביטוי '{phrase}'." if is_hebrew
141
+ else f"{count} feedback entries contain the phrase '{phrase}'.")
142
+ else:
143
+ summary = (f"{count} משובים נמצאו." if is_hebrew else f"{count} feedback entries found.")
144
+
145
+ # Provide examples from semantic retrieval for context
146
+ results = self.retrieve(query, top_k=top_k)
147
+ return RetrievalOutput(query=query, results=results, summary=summary)
148
+
149
+ # Fallback to semantic QA
150
+ return self.query(query, top_k=top_k)
151
+
152
 
153
  def main() -> None:
154
  parser = argparse.ArgumentParser()
app/sentiment.py CHANGED
@@ -1,5 +1,13 @@
1
  from __future__ import annotations
2
 
 
 
 
 
 
 
 
 
3
  from functools import lru_cache
4
  from typing import List, Dict
5
 
 
1
  from __future__ import annotations
2
 
3
+ """Sentiment analysis helpers using Hugging Face transformers.
4
+
5
+ This module provides a cached sentiment pipeline to analyze lists of texts.
6
+ The model used (`cardiffnlp/twitter-xlm-roberta-base-sentiment`) is multilingual and
7
+ works reasonably well for short feedback messages. The pipeline is cached to avoid
8
+ reloading the model for each call.
9
+ """
10
+
11
  from functools import lru_cache
12
  from typing import List, Dict
13
 
app/vector_store.py CHANGED
@@ -1,5 +1,13 @@
1
  from __future__ import annotations
2
 
 
 
 
 
 
 
 
 
3
  import os
4
  from dataclasses import dataclass
5
  from typing import List, Tuple, Optional
 
1
  from __future__ import annotations
2
 
3
+ """A thin wrapper around FAISS index and a Pandas DataFrame for metadata.
4
+
5
+ FaissVectorStore provides methods to add vectors, perform nearest-neighbor search,
6
+ and persist both the FAISS index and the accompanying metadata (as a parquet file).
7
+
8
+ SearchResult holds the matched index, similarity score and the original metadata row.
9
+ """
10
+
11
  import os
12
  from dataclasses import dataclass
13
  from typing import List, Tuple, Optional
run.py CHANGED
@@ -1,3 +1,10 @@
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  import uvicorn # type: ignore
 
1
+ """Entrypoint to run the FastAPI application.
2
+
3
+ This module is a tiny wrapper around uvicorn to run the `app.api:app` ASGI
4
+ application on port 8000. Use `python run.py` in development or let the
5
+ container CMD call this file in production.
6
+ """
7
+
8
  from __future__ import annotations
9
 
10
  import uvicorn # type: ignore
scripts/precompute_index.py CHANGED
@@ -1,5 +1,13 @@
1
  from __future__ import annotations
2
 
 
 
 
 
 
 
 
 
3
  import os
4
  from pathlib import Path
5
 
 
1
  from __future__ import annotations
2
 
3
+ """Script to precompute the FAISS vector index locally.
4
+
5
+ When deploying to Runpod it's often useful to precompute embeddings and store
6
+ the FAISS index so the server can start quickly without re-embedding the
7
+ entire dataset on first boot. This script writes the index and metadata to
8
+ the configured `VECTOR_INDEX_PATH` and `VECTOR_METADATA_PATH`.
9
+ """
10
+
11
  import os
12
  from pathlib import Path
13
 
scripts/test_queries.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Small harness to demonstrate query type detection and quick counts.
2
+
3
+ This script intentionally keeps heavy dependencies optional: it runs the
4
+ lightweight count logic (keyword-based) directly from the CSV. If the FAISS
5
+ index and embedding dependencies are available, it will also show example
6
+ contexts from semantic retrieval.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from app.data_loader import load_feedback
12
+ from app.analysis import detect_query_type, resolve_count_from_type
13
+
14
+
15
+ def run_examples():
16
+ examples = [
17
+ "כמה משתמשים מתלוננים על אלמנטים שלא עובדים להם במערכת",
18
+ "כמה משתמשים כתבו תודה",
19
+ "יש תקלות בשירות ההרשמה",
20
+ "מה הבעיות העיקריות שמשתמשים מציינים?",
21
+ ]
22
+
23
+ df = load_feedback()
24
+
25
+ for q in examples:
26
+ print("\nQuery:", q)
27
+ qtype, target = detect_query_type(q)
28
+ print("Detected type:", qtype, "target:", target)
29
+ resolved = resolve_count_from_type(df, qtype, target)
30
+ if resolved.get("type") == "count":
31
+ print("Count result:", resolved.get("count"), resolved.get("label"))
32
+ else:
33
+ # Fallback to semantic answer (may require heavy deps and a built index). Try to import and run if available.
34
+ try:
35
+ from app.rag_service import RAGService
36
+ svc = RAGService()
37
+ out = svc.answer(q, top_k=3)
38
+ print("Summary:", out.summary)
39
+ for r in out.results:
40
+ print(f"- [{r.score:.3f}] {r.row.get('ServiceName','')} | {r.row.get('Text','')[:120]}")
41
+ except FileNotFoundError:
42
+ print("Vector index not found. Run /ingest or precompute index to see examples.")
43
+ except Exception as e:
44
+ print("Semantic retrieval unavailable (missing packages or other error):", e)
45
+
46
+
47
+ if __name__ == "__main__":
48
+ run_examples()