Spaces:

pluto90
/

Smart-Notes-backend

Running

App Files Files Community

pluto90 committed on Apr 12

Commit

f06dea6

verified ·

1 Parent(s): f383d3f

Upload 6 files

Browse files

Files changed (5) hide show

app/core/embedding_engine.py +233 -160
app/core/llm_engine.py +41 -19
app/core/pdf_processor.py +52 -120
app/core/rag_service.py +55 -58
app/core/text_splitter.py +22 -0

app/core/embedding_engine.py CHANGED Viewed

@@ -1,160 +1,233 @@
-# # embedding_engine.py
-# import uuid
-# from qdrant_client import QdrantClient, models
-# from qdrant_client.http.models import Distance, VectorParams
-# from sentence_transformers import SentenceTransformer
-# from app.core.config import QDRANT_URL, QDRANT_API_KEY
-# # from config import QDRANT_URL, QDRANT_API_KEY
-# # embedder = SentenceTransformer("all-MiniLM-L6-v2")
-# # embedder.save("models/all-MiniLM-L6-v2")
-# MODEL_PATH = "app/core/models/all-MiniLM-L6-v2"
-# embedder = SentenceTransformer(MODEL_PATH)
-# qdrant = QdrantClient(
-#     url=QDRANT_URL,
-#     api_key=QDRANT_API_KEY,
-#     check_compatibility=False
-#     )
-# COLLECTION_NAME = "smartnotes"
-# BATCH_SIZE = 100
-# def ensure_collection():
-#     collections = qdrant.get_collections().collections
-#     if COLLECTION_NAME not in [c.name for c in collections]:
-#         qdrant.create_collection(
-#             collection_name=COLLECTION_NAME,
-#             vectors_config=VectorParams(
-#                 size=384,
-#                 distance=Distance.COSINE
-#             ),
-#         )
-#             # ✅ Add this part
-#     qdrant.create_payload_index(
-#         collection_name=COLLECTION_NAME,
-#         field_name="doc_id",
-#         field_schema="keyword"
-#     )
-# def embed_and_store(text_chunks, doc_id):
-#     """Embed chunks and store them in Qdrant efficiently."""
-#     ensure_collection()
-#     print(f"🔹 Embedding {len(text_chunks)} chunks...")
-#     # Generate embeddings
-#     vectors = embedder.encode(text_chunks, show_progress_bar=True).tolist()
-#     # Prepare points
-#     points = [
-#         models.PointStruct(
-#             id=str(uuid.uuid4()),
-#             vector=vectors[i],
-#             payload={"doc_id": doc_id, "text": text_chunks[i]},
-#         )
-#         for i in range(len(vectors))
-#     ]
-#     # ✅ Upsert in small batches to avoid timeouts
-#     print("🔹 Uploading to Qdrant in batches...")
-#     for i in range(0, len(points), BATCH_SIZE):
-#         batch = points[i:i + BATCH_SIZE]
-#         qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
-#         print(f"   → Uploaded batch {i // BATCH_SIZE + 1}/{len(points) // BATCH_SIZE + 1}")
-#     print("✅ All embeddings stored successfully!")
-# embedding_engine.py
-import uuid
-from qdrant_client import QdrantClient, models
-from qdrant_client.http.models import Distance, VectorParams
-from sentence_transformers import SentenceTransformer
-from app.core.config import QDRANT_URL, QDRANT_API_KEY
-# embedder = SentenceTransformer("all-MiniLM-L6-v2")
-# embedder.save("models/all-MiniLM-L6-v2")
-MODEL_PATH = "app/core/models/all-MiniLM-L6-v2"
-embedder = SentenceTransformer(MODEL_PATH)
-qdrant = QdrantClient(
-    url=QDRANT_URL,
-    api_key=QDRANT_API_KEY,
-    check_compatibility=False
-    )
-COLLECTION_NAME = "smartnotes"
-BATCH_SIZE = 100
-def ensure_collection():
-    collections = qdrant.get_collections().collections
-    if COLLECTION_NAME not in [c.name for c in collections]:
-        qdrant.create_collection(
-            collection_name=COLLECTION_NAME,
-            vectors_config=VectorParams(
-                size=384,
-                distance=Distance.COSINE
-            ),
-        )
-            # ✅ Add this part
-    qdrant.create_payload_index(
-        collection_name=COLLECTION_NAME,
-        field_name="doc_id",
-        field_schema="keyword"
-    )
-def embed_and_store(text_chunks, doc_id):
-    """Embed chunks and store them in Qdrant efficiently."""
-    ensure_collection()
-    print(f"🔹 Embedding {len(text_chunks)} chunks...")
-    # Generate embeddings
-    vectors = embedder.encode(text_chunks, show_progress_bar=True).tolist()
-    # Prepare points
-    points = [
-        models.PointStruct(
-            id=str(uuid.uuid4()),
-            vector=vectors[i],
-            payload={"doc_id": doc_id, "text": text_chunks[i]},
-        )
-        for i in range(len(vectors))
-    ]
-    # ✅ Upsert in small batches to avoid timeouts
-    print("🔹 Uploading to Qdrant in batches...")
-    for i in range(0, len(points), BATCH_SIZE):
-        batch = points[i:i + BATCH_SIZE]
-        qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
-        print(f"   → Uploaded batch {i // BATCH_SIZE + 1}/{len(points) // BATCH_SIZE + 1}")
-    print("✅ All embeddings stored successfully!")

+# # embedding_engine.py
+# import uuid, time
+# from qdrant_client import QdrantClient, models
+# from qdrant_client.http.models import Distance, VectorParams
+# from qdrant_client.http.exceptions import UnexpectedResponse
+# from sentence_transformers import SentenceTransformer
+# from app.core.config import QDRANT_URL, QDRANT_API_KEY
+# MODEL_PATH = "app/core/models/bge-base-en-v1.5"
+# embedder = SentenceTransformer(MODEL_PATH)
+# qdrant = QdrantClient(
+#     url=QDRANT_URL,
+#     api_key=QDRANT_API_KEY,
+#     check_compatibility=False
+#     )
+# COLLECTION_NAME = "smartnotes"
+# BATCH_SIZE = 10
+# def ensure_collection():
+#     collections = qdrant.get_collections().collections
+#     if COLLECTION_NAME not in [c.name for c in collections]:
+#         qdrant.create_collection(
+#             collection_name=COLLECTION_NAME,
+#             vectors_config=VectorParams(
+#                 size=768,
+#                 distance=Distance.COSINE
+#             ),
+#         )
+#             # ✅ Add this part
+#     qdrant.create_payload_index(
+#         collection_name=COLLECTION_NAME,
+#         field_name="doc_id",
+#         field_schema="keyword"
+#     )
+# def embed_and_store(text_chunks, doc_id):
+#     print(f"📊 Embedding and storing {len(text_chunks)} chunks...")
+#     ensure_collection()
+#     print(f"🔹 Embedding {len(text_chunks)} chunks...")
+#     vectors = embed_documents(text_chunks)
+#     points = [
+#         models.PointStruct(
+#             id=str(uuid.uuid4()),
+#             vector=vectors[i],
+#             payload={
+#                 "doc_id": doc_id,
+#                 "text": text_chunks[i],
+#                 "chunk_id": i,
+#                 "length": len(text_chunks[i])
+#             },
+#         )
+#         for i in range(len(vectors))
+#     ]
+#     print("🔹 Uploading to Qdrant in batches...")
+#     for i in range(0, len(points), BATCH_SIZE):
+#         batch = points[i:i + BATCH_SIZE]
+#         success = False
+#         retries = 3
+#         while not success and retries > 0:
+#             try:
+#                 qdrant.upsert(
+#                     collection_name=COLLECTION_NAME,
+#                     points=batch
+#                 )
+#                 success = True
+#                 print(f"   → Uploaded batch {i // BATCH_SIZE + 1}")
+#             except Exception as e:
+#                 print("❌ Qdrant error:", e)
+#                 retries -= 1
+#                 time.sleep(1.5)   # 🔥 increase wait
+#         if not success:
+#             print("⚠️ Skipping batch after retries")
+#         time.sleep(0.4)  # 🔥 throttle
+# def embed_documents(texts):
+#     vectors= []
+#     for i in range(0, len(texts), 32):
+#         batch = texts[i:i+32]
+#         batch_vectors = embedder.encode(batch, show_progress_bar=False)
+#         vectors.extend(batch_vectors.tolist())
+#     return vectors
+# def embed_query(text):
+#     return embedder.encode(
+#         f"query: {text}",
+#         normalize_embeddings=True
+#     )
+# embedding_engine.py
+import uuid, time
+from qdrant_client import QdrantClient, models
+from qdrant_client.http.models import Distance, VectorParams
+from sentence_transformers import SentenceTransformer
+from app.core.config import QDRANT_URL, QDRANT_API_KEY
+# Path the SentenceTransformer model is loaded from.
+MODEL_PATH = "app/core/models/bge-base-en-v1.5"
+# The model is loaded once, when this module is imported.
+embedder = SentenceTransformer(MODEL_PATH)
+# Qdrant client built from the URL and API key in app.core.config.
+qdrant = QdrantClient(
+    url=QDRANT_URL,
+    api_key=QDRANT_API_KEY,
+    check_compatibility=False
+)
+# Single collection used for every document; points carry a "doc_id" payload field.
+COLLECTION_NAME = "smartnotes"
+# Number of points sent per upsert request in embed_and_store.
+BATCH_SIZE = 5  # ✅ reduced for free tier
+def ensure_collection():
+    collections = qdrant.get_collections().collections
+    if COLLECTION_NAME not in [c.name for c in collections]:
+        qdrant.create_collection(
+            collection_name=COLLECTION_NAME,
+            vectors_config=VectorParams(size=768, distance=Distance.COSINE),
+        )
+    qdrant.create_payload_index(
+        collection_name=COLLECTION_NAME,
+        field_name="doc_id",
+        field_schema="keyword"
+    )
+def embed_and_store(text_chunks, doc_id):
+    print(f"📊 Final chunks being embedded: {len(text_chunks)}")
+    ensure_collection()
+    vectors = embed_documents(text_chunks)  # ✅ now uses correct doc prefix
+    points = [
+        models.PointStruct(
+            id=str(uuid.uuid4()),
+            vector=vectors[i],
+            payload={
+                "doc_id": doc_id,
+                "text": text_chunks[i],
+                "chunk_id": i,
+                "length": len(text_chunks[i])
+            },
+        )
+        for i in range(len(vectors))
+    ]
+    failed_batches = []
+    for i in range(0, len(points), BATCH_SIZE):
+        batch = points[i:i + BATCH_SIZE]
+        batch_num = i // BATCH_SIZE + 1
+        success = False
+        for attempt in range(4):  # ✅ 4 attempts with exponential backoff
+            try:
+                qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
+                success = True
+                print(f"   → Batch {batch_num} uploaded")
+                break
+            except Exception as e:
+                wait = 2 ** attempt  # 1s, 2s, 4s, 8s
+                print(f"   ⚠️ Batch {batch_num} attempt {attempt+1} failed: {e} | retrying in {wait}s")
+                time.sleep(wait)
+        if not success:
+            failed_batches.append(batch_num)
+            print(f"   ❌ Batch {batch_num} permanently failed")
+        time.sleep(0.6)  # ✅ throttle between successful batches
+    if failed_batches:
+        # ✅ raise so the caller (routes.py) knows something went wrong
+        raise RuntimeError(f"Failed to upload batches: {failed_batches}")
+    print(f"✅ All batches uploaded for doc_id={doc_id}")
+def embed_documents(texts):
+    """Embed document chunks with correct BGE prefix and normalization."""
+    prefixed = [f"Represent this sentence: {t}" for t in texts]  # ✅ correct BGE doc prefix
+    vectors = []
+    for i in range(0, len(prefixed), 32):
+        batch = prefixed[i:i + 32]
+        batch_vectors = embedder.encode(
+            batch, normalize_embeddings=True, show_progress_bar=False)
+        vectors.extend(batch_vectors.tolist())
+    return vectors
+def embed_query(text):
+    """Embed a search query — BGE uses 'query:' prefix for retrieval."""
+    return embedder.encode(
+        f"query: {text}",
+        normalize_embeddings=True
+    ).tolist()  # ✅ always return list, not numpy array

app/core/llm_engine.py CHANGED Viewed

@@ -1,19 +1,41 @@
-# llm_engine.py
-import google.generativeai as genai
-from app.core.config import GEMINI_API_KEY
-from langchain_google_genai import ChatGoogleGenerativeAI
-# ✅ Configure Gemini client
-genai.configure(api_key=GEMINI_API_KEY)
-llm = ChatGoogleGenerativeAI(
-    model="gemini-2.5-flash",
-    google_api_key=GEMINI_API_KEY,
-    temperature=0.2,
-    max_output_tokens=500,
-    convert_system_message_to_human=True
-)

+# # llm_engine.py
+import google.generativeai as genai
+from app.core.config import GEMINI_API_KEY
+from langchain_google_genai import ChatGoogleGenerativeAI
+# ✅ Configure Gemini client
+genai.configure(api_key=GEMINI_API_KEY)
+# Main chat model: gemini-2.5-flash at temperature 0.2, capped at 800 output tokens.
+llm = ChatGoogleGenerativeAI(
+    model="gemini-2.5-flash",
+    google_api_key=GEMINI_API_KEY,
+    temperature=0.2,
+    max_output_tokens=800,
+)
+# # ✅ Separate LLM for evaluator — needs near-deterministic JSON output
+# eval_llm = ChatGoogleGenerativeAI(
+#     model="gemini-2.5-flash",
+#     google_api_key=GEMINI_API_KEY,
+#     temperature=0.0,              # ✅ deterministic — evaluator must return valid JSON
+#     max_output_tokens=200,        # ✅ evaluator only returns a small JSON blob
+#     thinking_level="none" # to disable chain-of-thought
+# )
+# Second model instance for the evaluator: temperature 0.0 and at most 200
+# output tokens. The commented-out model_kwargs below are an unused attempt to
+# configure a thinking budget.
+eval_llm = ChatGoogleGenerativeAI(
+    model="gemini-2.0-flash",  # no thinking, faster
+    google_api_key=GEMINI_API_KEY,
+    temperature=0.0,
+    max_output_tokens=200,
+    # model_kwargs={
+    #     "generation_config": {
+    #         "thinking_config": {
+    #             "thinking_budget": 0  # ✅ 0 = disabled, bypasses langchain validation entirely
+    #         }
+    #     }
+    # }
+)

app/core/pdf_processor.py CHANGED Viewed

@@ -1,120 +1,52 @@
-# # pdf_preprocessor.py
-# import os
-# from pypdf import PdfReader
-# from pdf2image import convert_from_path
-# import pytesseract
-# # Optional: Set Tesseract path manually on Windows
-# # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
-# def extract_text_from_pdf(file_path: str) -> str:
-#     """
-#     Extract text from both text-based and image-based PDFs.
-#     Falls back to OCR using pytesseract if no embedded text is found.
-#     """
-#     text_output = []
-#     reader = PdfReader(file_path)
-#     total_pages = len(reader.pages)
-#     print(f"📄 Processing PDF: {file_path} ({total_pages} pages)")
-#     for page_num, page in enumerate(reader.pages, start=1):
-#         try:
-#             # Try normal text extraction
-#             extracted_text = page.extract_text()
-#             if extracted_text and extracted_text.strip():
-#                 text_output.append(extracted_text)
-#                 print(f"✅ Page {page_num}: Extracted embedded text.")
-#             else:
-#                 # Run OCR if no text found
-#                 print(f"🔍 Page {page_num}: No text found, running OCR...")
-#                 images = convert_from_path(
-#                     file_path, first_page=page_num, last_page=page_num
-#                 )
-#                 ocr_text = ""
-#                 for img in images:
-#                     ocr_text += pytesseract.image_to_string(img, lang="eng", config="--psm 6")
-#                 if ocr_text.strip():
-#                     text_output.append(ocr_text)
-#                     print(f"🧠 Page {page_num}: OCR extraction complete.")
-#                 else:
-#                     print(f"⚠️ Page {page_num}: OCR found no readable text.")
-#         except Exception as e:
-#             print(f"❌ Error processing page {page_num}: {e}")
-#     full_text = "\n".join(text_output)
-#     if not full_text.strip():
-#         print("⚠️ Warning: No text extracted from this PDF at all.")
-#     else:
-#         print(f"✅ Done! Extracted {len(full_text.split())} words total.")
-#     return full_text
-# pdf_preprocessor.py
-import os
-from pypdf import PdfReader
-from pdf2image import convert_from_path
-import pytesseract
-# Optional: Set Tesseract path manually on Windows
-# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
-def extract_text_from_pdf(file_path: str) -> str:
-    """
-    Extract text from both text-based and image-based PDFs.
-    Falls back to OCR using pytesseract if no embedded text is found.
-    """
-    text_output = []
-    reader = PdfReader(file_path)
-    total_pages = len(reader.pages)
-    print(f"📄 Processing PDF: {file_path} ({total_pages} pages)")
-    for page_num, page in enumerate(reader.pages, start=1):
-        try:
-            # Try normal text extraction
-            extracted_text = page.extract_text()
-            if extracted_text and extracted_text.strip():
-                text_output.append(extracted_text)
-                print(f"✅ Page {page_num}: Extracted embedded text.")
-            else:
-                # Run OCR if no text found
-                print(f"🔍 Page {page_num}: No text found, running OCR...")
-                images = convert_from_path(
-                    file_path, first_page=page_num, last_page=page_num
-                )
-                ocr_text = ""
-                for img in images:
-                    ocr_text += pytesseract.image_to_string(img, lang="eng", config="--psm 6")
-                if ocr_text.strip():
-                    text_output.append(ocr_text)
-                    print(f"🧠 Page {page_num}: OCR extraction complete.")
-                else:
-                    print(f"⚠️ Page {page_num}: OCR found no readable text.")
-        except Exception as e:
-            print(f"❌ Error processing page {page_num}: {e}")
-    full_text = "\n".join(text_output)
-    if not full_text.strip():
-        print("⚠️ Warning: No text extracted from this PDF at all.")
-    else:
-        print(f"✅ Done! Extracted {len(full_text.split())} words total.")
-    return full_text

+# pdf_preprocessor.py
+import os
+from pypdf import PdfReader
+from pdf2image import convert_from_path
+import pytesseract
+# Optional: Set Tesseract path manually on Windows
+# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+def extract_text_from_pdf(file_path: str) -> str:
+    """
+    Extract text from both text-based and image-based PDFs.
+    Falls back to OCR using pytesseract if no embedded text is found.
+    """
+    text_output = []
+    reader = PdfReader(file_path)
+    total_pages = len(reader.pages)
+    print(f"📄 Processing PDF: {file_path} ({total_pages} pages)")
+    for page_num, page in enumerate(reader.pages, start=1):
+        try:
+            # Try normal text extraction
+            extracted_text = page.extract_text()
+            if extracted_text and extracted_text.strip():
+                text_output.append(extracted_text)
+                print(f"✅ Page {page_num}: Extracted embedded text.")
+            else:
+                # Run OCR if no text found
+                print(f"🔍 Page {page_num}: No text found, running OCR...")
+                images = convert_from_path(
+                    file_path, first_page=page_num, last_page=page_num
+                )
+                ocr_text = ""
+                for img in images:
+                    ocr_text += pytesseract.image_to_string(img, lang="eng", config="--psm 6")
+                if ocr_text.strip():
+                    text_output.append(ocr_text)
+                    print(f"🧠 Page {page_num}: OCR extraction complete.")
+                else:
+                    print(f"⚠️ Page {page_num}: OCR found no readable text.")
+        except Exception as e:
+            print(f"❌ Error processing page {page_num}: {e}")
+    full_text = "\n\n".join(text_output)
+    if not full_text.strip():
+        print("⚠️ Warning: No text extracted from this PDF at all.")
+    else:
+        print(f"✅ Done! Extracted {len(full_text.split())} words total.")
+    return full_text

app/core/rag_service.py CHANGED Viewed

@@ -1,55 +1,34 @@
-# app/core/rag_service.py
-from app.core.embedding_engine import embedder, COLLECTION_NAME
-from qdrant_client.http.models import Filter, FieldCondition, MatchValue
-from qdrant_client import QdrantClient
-from app.core.config import QDRANT_URL, QDRANT_API_KEY
-qdrant_client = QdrantClient(
-    url=QDRANT_URL,
-    api_key=QDRANT_API_KEY,
-    check_compatibility=False
-)
-# def get_rag_context(question: str, doc_id: str):
-#     question_vector = embedder.encode([question])[0].tolist()
-#     hits = qdrant_client.query_points(
-#         collection_name=COLLECTION_NAME,
-#         query=question_vector,
-#         query_filter=Filter(
-#             must=[FieldCondition(key="doc_id", match=MatchValue(value=doc_id))]
-#         ),
-#         limit=5,
-#     ).points
-#     # context = "\n".join([hit.payload["text"] for hit in hits])
-#     contexts = []
-#     sources = []
-#     for hit in hits:
-#         text = hit.payload.get("text", "")
-#         contexts.append(text)
-#         sources.append({
-#             "text": text[:300],   # limit for UI
-#             # add page if you have it later
-#         })
-#     context = "\n".join(contexts)
-#     return context, sources
 # def get_rag_context(query, doc_id, top_k=3):
 #     query_vector = embedder.encode(query).tolist()
 #     results = qdrant_client.query_points(
-#         collection_name=doc_id,
 #         query=query_vector,
-#         limit=top_k
 #     )
 #     points = results.points
@@ -65,23 +44,38 @@ qdrant_client = QdrantClient(
-def get_rag_context(query, doc_id, top_k=3):
-    # ✅ Embed query
-    query_vector = embedder.encode(query).tolist()
-    # ✅ Query SINGLE collection + filter by doc_id
     results = qdrant_client.query_points(
-        collection_name="smartnotes",   # 🔥 FIXED
         query=query_vector,
         limit=top_k,
         query_filter=Filter(
-            must=[
-                FieldCondition(
-                    key="doc_id",
-                    match=MatchValue(value=doc_id)
-                )
-            ]
         )
     )
@@ -90,9 +84,12 @@ def get_rag_context(query, doc_id, top_k=3):
     if not points:
         return "", [], []
-    context = "\n".join([p.payload["text"] for p in points])
-    sources = [p.payload.get("source") for p in points]
     scores = [p.score for p in points]
     return context, sources, scores

+# # app/core/rag_service.py
+# from app.core.embedding_engine import embedder, COLLECTION_NAME
+# from qdrant_client.http.models import Filter, FieldCondition, MatchValue
+# from qdrant_client import QdrantClient
+# from app.core.config import QDRANT_URL, QDRANT_API_KEY
+# qdrant_client = QdrantClient(
+#     url=QDRANT_URL,
+#     api_key=QDRANT_API_KEY,
+#     check_compatibility=False
+# )
 # def get_rag_context(query, doc_id, top_k=3):
+#     # ✅ Embed query
 #     query_vector = embedder.encode(query).tolist()
+#     # ✅ Query SINGLE collection + filter by doc_id
 #     results = qdrant_client.query_points(
+#         collection_name="smartnotes",   # 🔥 FIXED
 #         query=query_vector,
+#         limit=top_k,
+#         query_filter=Filter(
+#             must=[
+#                 FieldCondition(
+#                     key="doc_id",
+#                     match=MatchValue(value=doc_id)
+#                 )
+#             ]
+#         )
 #     )
 #     points = results.points
+# app/core/rag_service.py
+from app.core.embedding_engine import embed_query, COLLECTION_NAME  # ✅ use the correct function
+from qdrant_client.http.models import Filter, FieldCondition, MatchValue
+from qdrant_client import QdrantClient
+from app.core.config import QDRANT_URL, QDRANT_API_KEY
+qdrant_client = QdrantClient(
+    url=QDRANT_URL,
+    api_key=QDRANT_API_KEY,
+    check_compatibility=False
+)
+def get_rag_context(query, doc_id, top_k=5):  # ✅ top_k=5 for better recall
+    query_vector = embed_query(query)  # ✅ uses "query: " prefix + returns list
     results = qdrant_client.query_points(
+        collection_name=COLLECTION_NAME,
         query=query_vector,
         limit=top_k,
+        score_threshold=0.35,  # ✅ filter truly irrelevant results early
         query_filter=Filter(
+            must=[FieldCondition(key="doc_id", match=MatchValue(value=doc_id))]
         )
     )
     if not points:
         return "", [], []
+    context = "\n\n---\n\n".join([p.payload["text"] for p in points])  # ✅ clearer separator
+    sources = [p.payload.get("chunk_id", i) for i, p in enumerate(points)]
     scores = [p.score for p in points]
     return context, sources, scores

app/core/text_splitter.py ADDED Viewed

	@@ -0,0 +1,22 @@

+# text_splitter.py
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+# Module-level splitter used by split_text: chunk_size 500 with chunk_overlap 150.
+splitter = RecursiveCharacterTextSplitter(
+    chunk_size=500,
+    chunk_overlap=150,
+    separators=["\n\n", "\n", ".", " ", ""]
+)
+def split_text(text):
+    chunks = splitter.split_text(text)
+    # 🔥 CLEANING STEP (VERY IMPORTANT)
+    cleaned_chunks = []
+    for chunk in chunks:
+        chunk = chunk.strip()
+        if len(chunk) > 50:   # ❌ remove tiny garbage chunks
+            cleaned_chunks.append(chunk)
+    return cleaned_chunks