Spaces:

Qar-Raz
/

NLP-RAG

Sleeping

App Files Files Community

Qar-Raz committed on Apr 5

Commit

8f37cc7

verified ·

1 Parent(s): 860aa5d

Sync backend Docker context from GitHub main

Browse files

Files changed (4) hide show

backend/schemas.py +1 -1
data/ingest.py +42 -42
retriever/retriever.py +3 -3
test.py +29 -18

backend/schemas.py CHANGED Viewed

@@ -9,7 +9,7 @@ from pydantic import BaseModel, Field
 class PredictRequest(BaseModel):
     query: str = Field(..., min_length=1, description="User query text")
     model: str = Field(default="Llama-3-8B", description="Model name key")
-    top_k: int = Field(default=10, ge=1, le=20)
     final_k: int = Field(default=3, ge=1, le=8)
     chunking_technique: str = Field(default="all", description="all | fixed | sentence | paragraph | semantic | recursive | page | markdown")
     mode: str = Field(default="hybrid", description="semantic | bm25 | hybrid")

 class PredictRequest(BaseModel):
     query: str = Field(..., min_length=1, description="User query text")
     model: str = Field(default="Llama-3-8B", description="Model name key")
+    top_k: int = Field(default=50, ge=1, le=100)
     final_k: int = Field(default=3, ge=1, le=8)
     chunking_technique: str = Field(default="all", description="all | fixed | sentence | paragraph | semantic | recursive | page | markdown")
     mode: str = Field(default="hybrid", description="semantic | bm25 | hybrid")

data/ingest.py CHANGED Viewed

@@ -15,55 +15,55 @@ from retriever.processor import ChunkProcessor
 # 6 different chunking techniques for ablation study
 CHUNKING_TECHNIQUES = [
-    {
-        "name": "fixed",
-        "description": "Fixed-size chunking - splits every N characters (may cut sentences mid-way)",
-        "chunk_size": 1000,
-        "chunk_overlap": 100,
-        "kwargs": {"separator": ""},  # No separator for fixed splitting
-    },
-    {
-        "name": "sentence",
-        "description": "Sentence-level chunking - respects sentence boundaries (NLTK)",
-        "chunk_size": 1000,
-        "chunk_overlap": 100,
-        "kwargs": {},
-    },
-    {
-        "name": "paragraph",
-        "description": "Paragraph-level chunking - uses natural paragraph breaks",
-        "chunk_size": 2500,
-        "chunk_overlap": 100,
-        "kwargs": {"separator": "\n\n"},  # Split on paragraph breaks
-    },
     # {
-    #     "name": "semantic",
-    #     "description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
-    #     "chunk_size": 2000,
     #     "chunk_overlap": 100,
-    #     "kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
     # },
     {
-        "name": "recursive",
-        "description": "Recursive chunking - hierarchical splitting (paragraphs → sentences → words → chars)",
         "chunk_size": 2000,
         "chunk_overlap": 100,
-        "kwargs": {"separators": ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""], "keep_separator": True},
-    },
-    {
-        "name": "page",
-        "description": "Page-level chunking - uses entire book pages as-is",
-        "chunk_size": 10000,  # Very large to keep full pages
-        "chunk_overlap": 0,   # No overlap between pages
-        "kwargs": {"separator": "--- Page"},  # Split on page markers
-    },
-    {
-        "name": "markdown",
-        "description": "Markdown header chunking - splits by headers (#, ##, ###, ####) with 4k char limit",
-        "chunk_size": 4000,  # Max 4k chars per chunk
-        "chunk_overlap": 0,  # No overlap for markdown
-        "kwargs": {},  # Custom implementation
     },
 ]

 # 7 chunking techniques are defined for the ablation study; comment entries in or out to select which ones run
 CHUNKING_TECHNIQUES = [
     # {
+    #     "name": "fixed",
+    #     "description": "Fixed-size chunking - splits every N characters (may cut sentences mid-way)",
+    #     "chunk_size": 1000,
+    #     "chunk_overlap": 100,
+    #     "kwargs": {"separator": ""},  # No separator for fixed splitting
+    # },
+    # {
+    #     "name": "sentence",
+    #     "description": "Sentence-level chunking - respects sentence boundaries (NLTK)",
+    #     "chunk_size": 1000,
+    #     "chunk_overlap": 100,
+    #     "kwargs": {},
+    # },
+    # {
+    #     "name": "paragraph",
+    #     "description": "Paragraph-level chunking - uses natural paragraph breaks",
+    #     "chunk_size": 2500,
     #     "chunk_overlap": 100,
+    #     "kwargs": {"separator": "\n\n"},  # Split on paragraph breaks
     # },
     {
+        "name": "semantic",
+        "description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
         "chunk_size": 2000,
         "chunk_overlap": 100,
+        "kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
     },
+    # {
+    #     "name": "recursive",
+    #     "description": "Recursive chunking - hierarchical splitting (paragraphs → sentences → words → chars)",
+    #     "chunk_size": 2000,
+    #     "chunk_overlap": 100,
+    #     "kwargs": {"separators": ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""], "keep_separator": True},
+    # },
+    # {
+    #     "name": "page",
+    #     "description": "Page-level chunking - uses entire book pages as-is",
+    #     "chunk_size": 10000,  # Very large to keep full pages
+    #     "chunk_overlap": 0,   # No overlap between pages
+    #     "kwargs": {"separator": "--- Page"},  # Split on page markers
+    # },
+    # {
+    #     "name": "markdown",
+    #     "description": "Markdown header chunking - splits by headers (#, ##, ###, ####) with 4k char limit",
+    #     "chunk_size": 4000,  # Max 4k chars per chunk
+    #     "chunk_overlap": 0,  # No overlap for markdown
+    #     "kwargs": {},  # Custom implementation
+    # },
 ]

retriever/retriever.py CHANGED Viewed

@@ -324,9 +324,6 @@ class HybridRetriever:
         rerank_time = 0.0
         mmr_time = 0.0
-        if use_mmr:
-            final_k = 10
         if should_print:
             self._print_search_header(query, mode, rerank_strategy, top_k, final_k)
             if requested_technique:
@@ -383,6 +380,9 @@ class HybridRetriever:
             label += " + MMR"
             mmr_time = time.perf_counter() - mmr_start
         if test and rerank_strategy != "cross-encoder" and candidates:
             _, test_scores = self._cross_encoder_rerank(query, candidates, len(candidates))
             avg_chunk_score = float(np.mean(test_scores)) if test_scores else 0.0

         rerank_time = 0.0
         mmr_time = 0.0
         if should_print:
             self._print_search_header(query, mode, rerank_strategy, top_k, final_k)
             if requested_technique:
             label += " + MMR"
             mmr_time = time.perf_counter() - mmr_start
+        # Safety cap: always honor requested final_k regardless of retrieval strategy.
+        candidates = candidates[:final_k]
         if test and rerank_strategy != "cross-encoder" and candidates:
             _, test_scores = self._cross_encoder_rerank(query, candidates, len(candidates))
             avg_chunk_score = float(np.mean(test_scores)) if test_scores else 0.0

test.py CHANGED Viewed

@@ -35,14 +35,14 @@ def generate_retrieval_report(all_results, queries, output_file="retrieval_repor
             chunks = chunks_data.get('chunks', [])
             score = chunks_data.get('score', 0)
-            content += f"**ChunkScore:** {score:.4f} | **Chunks retrieved:** {len(chunks)}\n\n"
             if not chunks:
-                content += "*No chunks retrieved.*\n\n"
             else:
                 for i, chunk in enumerate(chunks, 1):
-                    content += f"**[Chunk {i}]** ({len(chunk)} chars):\n"
-                    content += f"```text\n{chunk}\n```\n\n"
             content += "---\n\n"
@@ -61,11 +61,17 @@ def main():
         raise RuntimeError("PINECONE_API_KEY not found in environment variables")
     test_queries = [
-        "What is cognitive behavior therapy and how does it work?",
-        "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
-        "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
-        "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying."
-    ]
     # TECHNIQUES_TO_EVALUATE = ["fixed", "semantic", "markdown", "page"]
     # Use all 7 chunking techniques from ingest.py
@@ -75,11 +81,16 @@ def main():
         print(f"  - {tech['name']}: {tech['description']}")
     RETRIEVAL_STRATEGIES = [
-        {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr"},
-        {"mode": "semantic", "use_mmr": True,  "label": "semantic-with-mmr"},
-        {"mode": "hybrid",   "use_mmr": False, "label": "hybrid-no-mmr"},
-        {"mode": "hybrid",   "use_mmr": True,  "label": "hybrid-with-mmr"},
-        {"mode": "bm25",     "use_mmr": False, "label": "bm25-no-mmr"},
     ]
     print("Initializing ChunkProcessor to load Embedding Model...")
@@ -117,7 +128,7 @@ def main():
             technique_name = technique['name']
             for strategy in RETRIEVAL_STRATEGIES:
-                result_key = f"{technique_name} + {strategy['label']}"
                 print(f"\nEvaluating: {result_key}")
                 try:
@@ -125,9 +136,9 @@ def main():
                         query=query,
                         index=index,
                         mode=strategy['mode'],
-                        rerank_strategy="cross-encoder",
                         use_mmr=strategy['use_mmr'],
-                        top_k=25,
                         final_k=4,
                         technique_name=technique_name,
                         verbose=False,
@@ -150,4 +161,4 @@ def main():
 if __name__ == '__main__':
-    main()

             chunks = chunks_data.get('chunks', [])
             score = chunks_data.get('score', 0)
+            content += f"*ChunkScore:* {score:.4f} | *Chunks retrieved:* {len(chunks)}\n\n"
             if not chunks:
+                content += "No chunks retrieved.\n\n"
             else:
                 for i, chunk in enumerate(chunks, 1):
+                    content += f"*[Chunk {i}]* ({len(chunk)} chars):\n"
+                    content += f"text\n{chunk}\n\n\n"
             content += "---\n\n"
         raise RuntimeError("PINECONE_API_KEY not found in environment variables")
     test_queries = [
+    "What is cognitive behavior therapy and how does it work?",
+    "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
+    "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
+    "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying.",
+    "My friend didn't text me back for five hours. I'm certain they are mad at me or that I've done something to ruin our friendship.",
+    "Can you explain the difference between a 'situation,' a 'thought,' and an 'emotion' in the context of a CBT thought record?",
+    "I have to do everything perfectly. If I make even one small mistake, it means the entire project is a total disaster and I've wasted everyone's time.",
+    "Whenever I have to give a presentation, my heart starts racing and I'm sure I'm going to have a heart attack or pass out in front of everyone.",
+    "I feel like I'm fundamentally broken and that if people really knew me, they would never want to be around me.",
+    "What is 'behavioral activation' and how can it help someone who is struggling with a lack of motivation or depression?"
+]
     # TECHNIQUES_TO_EVALUATE = ["fixed", "semantic", "markdown", "page"]
     # Use all 7 chunking techniques from ingest.py
         print(f"  - {tech['name']}: {tech['description']}")
     RETRIEVAL_STRATEGIES = [
+        {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr","rerank_strategy":"cross-encoder"},
+        {"mode": "semantic", "use_mmr": True,  "label": "semantic-with-mmr","rerank_strategy":"cross-encoder"},
+        {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr","rerank_strategy":"none"},
+        {"mode": "semantic", "use_mmr": True,  "label": "semantic-with-mmr","rerank_strategy":"none"},
+        {"mode": "hybrid",   "use_mmr": False, "label": "hybrid-no-mmr","rerank_strategy":"cross-encoder"},
+        {"mode": "hybrid",   "use_mmr": True,  "label": "hybrid-with-mmr","rerank_strategy":"rrf"},
+        {"mode": "hybrid",   "use_mmr": True,  "label": "hybrid-with-mmr","rerank_strategy":"cross-encoder"},
+        {"mode": "hybrid",   "use_mmr": False,  "label": "hybrid-with-mmr","rerank_strategy":"rrf"},
+        {"mode": "bm25",     "use_mmr": False, "label": "bm25-no-mmr","rerank_strategy":"cross-encoder"},
+        {"mode": "bm25",     "use_mmr": False, "label": "bm25-no-mmr","rerank_strategy":"none"},
     ]
     print("Initializing ChunkProcessor to load Embedding Model...")
             technique_name = technique['name']
             for strategy in RETRIEVAL_STRATEGIES:
+                result_key = f"{technique_name} + {strategy['label']} + {strategy['rerank_strategy']}"
                 print(f"\nEvaluating: {result_key}")
                 try:
                         query=query,
                         index=index,
                         mode=strategy['mode'],
+                        rerank_strategy=strategy['rerank_strategy'],
                         use_mmr=strategy['use_mmr'],
+                        top_k=50,
                         final_k=4,
                         technique_name=technique_name,
                         verbose=False,
 if __name__ == '__main__':
+    main()