Spaces:
Sleeping
Sleeping
korupolujayanth2004
committed on
Commit
·
a9bec9f
1
Parent(s):
8372873
Update embed_utils.py
Browse files
- backend/embed_utils.py +33 -27
backend/embed_utils.py
CHANGED
|
@@ -1,13 +1,18 @@
|
|
| 1 |
# backend/embed_utils.py
|
| 2 |
-
|
| 3 |
import os
|
| 4 |
-
import uuid
|
| 5 |
-
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
# Qdrant client models
|
| 8 |
from qdrant_client.http.models import PointStruct, Filter, FieldCondition, MatchValue, Distance, VectorParams, NamedVector, ScrollResult
|
| 9 |
-
from backend.qdrant_client import qdrant_client, KB_COLLECTION
|
| 10 |
-
from backend.document_loader import Document
|
| 11 |
from typing import List
|
| 12 |
|
| 13 |
# === Embedding Model Initialization ===
|
|
@@ -30,26 +35,26 @@ def embed_and_store_chunks(documents: List[Document], session_id: str):
|
|
| 30 |
Each chunk is associated with a session_id.
|
| 31 |
"""
|
| 32 |
points = []
|
| 33 |
-
current_timestamp = str(int(time.time()))
|
| 34 |
-
|
| 35 |
for doc in documents:
|
| 36 |
# Generate embedding for the chunk's text content
|
| 37 |
embedding = get_embedding(doc.text)
|
| 38 |
-
|
| 39 |
# Create payload for Qdrant, including chunk details and session ID
|
| 40 |
payload = {
|
| 41 |
"chunk_id": doc.chunk_id,
|
| 42 |
"text": doc.text,
|
| 43 |
-
"metadata": doc.metadata,
|
| 44 |
-
"session_id": session_id,
|
| 45 |
"upload_timestamp": current_timestamp,
|
| 46 |
-
"file_type": doc.metadata.get("file_type", "unknown"),
|
| 47 |
-
"source": doc.metadata.get("source", "unknown")
|
| 48 |
}
|
| 49 |
-
|
| 50 |
points.append(
|
| 51 |
PointStruct(
|
| 52 |
-
id=str(uuid.uuid4()),
|
| 53 |
vector=embedding,
|
| 54 |
payload=payload
|
| 55 |
)
|
|
@@ -59,7 +64,7 @@ def embed_and_store_chunks(documents: List[Document], session_id: str):
|
|
| 59 |
if points:
|
| 60 |
qdrant_client.upsert(
|
| 61 |
collection_name=KB_COLLECTION,
|
| 62 |
-
wait=True,
|
| 63 |
points=points
|
| 64 |
)
|
| 65 |
print(f"Stored {len(points)} chunks for session '{session_id}' into '{KB_COLLECTION}'.")
|
|
@@ -74,10 +79,10 @@ def search_knowledge_base(query_text: str, session_id: str, top_k: int = 5) -> s
|
|
| 74 |
Returns a concatenated string of the most relevant text chunks.
|
| 75 |
"""
|
| 76 |
if not query_text.strip():
|
| 77 |
-
return ""
|
| 78 |
-
|
| 79 |
query_embedding = get_embedding(query_text)
|
| 80 |
-
|
| 81 |
# Construct a filter to ensure we only search within the current session's data
|
| 82 |
session_filter = Filter(
|
| 83 |
must=[
|
|
@@ -87,30 +92,31 @@ def search_knowledge_base(query_text: str, session_id: str, top_k: int = 5) -> s
|
|
| 87 |
)
|
| 88 |
]
|
| 89 |
)
|
| 90 |
-
|
| 91 |
try:
|
| 92 |
# Perform the search in Qdrant with the query vector and session filter
|
| 93 |
search_result: List[ScrollResult] = qdrant_client.search(
|
| 94 |
collection_name=KB_COLLECTION,
|
| 95 |
query_vector=query_embedding,
|
| 96 |
-
query_filter=session_filter,
|
| 97 |
-
limit=top_k,
|
| 98 |
-
with_payload=True
|
| 99 |
)
|
| 100 |
-
|
| 101 |
context_chunks = []
|
| 102 |
for hit in search_result:
|
| 103 |
# Extract the text content from the payload of each relevant hit
|
| 104 |
if hit.payload and 'text' in hit.payload:
|
| 105 |
context_chunks.append(hit.payload['text'])
|
| 106 |
-
# print(f" Hit: {hit.payload.get('text', '')[:50]}... (Score: {hit.score})")
|
| 107 |
-
|
| 108 |
if context_chunks:
|
| 109 |
# Join relevant chunks into a single string to provide to the LLM
|
| 110 |
return "\n\n".join(context_chunks)
|
| 111 |
else:
|
| 112 |
print(f"No relevant context found in KB for session '{session_id}' and query: '{query_text}'")
|
| 113 |
-
return ""
|
|
|
|
| 114 |
except Exception as e:
|
| 115 |
print(f"Error during knowledge base search for session '{session_id}': {e}")
|
| 116 |
-
return ""
|
|
|
|
| 1 |
# backend/embed_utils.py
|
|
|
|
| 2 |
import os
|
| 3 |
+
import uuid # Added for generating point IDs
|
| 4 |
+
import time # For upload_timestamp
|
| 5 |
+
|
| 6 |
+
# Fix cache directory permissions for Hugging Face Spaces
|
| 7 |
+
os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache_custom"
|
| 8 |
+
os.environ["HF_HOME"] = "/tmp/transformers_cache_custom"
|
| 9 |
+
os.makedirs(os.environ["TRANSFORMERS_CACHE"], exist_ok=True)
|
| 10 |
+
|
| 11 |
from sentence_transformers import SentenceTransformer
|
| 12 |
# Qdrant client models
|
| 13 |
from qdrant_client.http.models import PointStruct, Filter, FieldCondition, MatchValue, Distance, VectorParams, NamedVector, ScrollResult
|
| 14 |
+
from backend.qdrant_client import qdrant_client, KB_COLLECTION # Import Qdrant client and collection name
|
| 15 |
+
from backend.document_loader import Document # Import the Document class definition
|
| 16 |
from typing import List
|
| 17 |
|
| 18 |
# === Embedding Model Initialization ===
|
|
|
|
| 35 |
Each chunk is associated with a session_id.
|
| 36 |
"""
|
| 37 |
points = []
|
| 38 |
+
current_timestamp = str(int(time.time())) # Use a Unix timestamp for when the document was uploaded
|
| 39 |
+
|
| 40 |
for doc in documents:
|
| 41 |
# Generate embedding for the chunk's text content
|
| 42 |
embedding = get_embedding(doc.text)
|
| 43 |
+
|
| 44 |
# Create payload for Qdrant, including chunk details and session ID
|
| 45 |
payload = {
|
| 46 |
"chunk_id": doc.chunk_id,
|
| 47 |
"text": doc.text,
|
| 48 |
+
"metadata": doc.metadata, # Preserve original metadata from document_loader
|
| 49 |
+
"session_id": session_id, # CRUCIAL: Associate each chunk with the current session
|
| 50 |
"upload_timestamp": current_timestamp,
|
| 51 |
+
"file_type": doc.metadata.get("file_type", "unknown"), # Get file_type from metadata
|
| 52 |
+
"source": doc.metadata.get("source", "unknown") # Get source from metadata
|
| 53 |
}
|
| 54 |
+
|
| 55 |
points.append(
|
| 56 |
PointStruct(
|
| 57 |
+
id=str(uuid.uuid4()), # Assign a unique ID for each Qdrant point
|
| 58 |
vector=embedding,
|
| 59 |
payload=payload
|
| 60 |
)
|
|
|
|
| 64 |
if points:
|
| 65 |
qdrant_client.upsert(
|
| 66 |
collection_name=KB_COLLECTION,
|
| 67 |
+
wait=True, # Wait for the operation to complete
|
| 68 |
points=points
|
| 69 |
)
|
| 70 |
print(f"Stored {len(points)} chunks for session '{session_id}' into '{KB_COLLECTION}'.")
|
|
|
|
| 79 |
Returns a concatenated string of the most relevant text chunks.
|
| 80 |
"""
|
| 81 |
if not query_text.strip():
|
| 82 |
+
return "" # Return empty string if query is empty
|
| 83 |
+
|
| 84 |
query_embedding = get_embedding(query_text)
|
| 85 |
+
|
| 86 |
# Construct a filter to ensure we only search within the current session's data
|
| 87 |
session_filter = Filter(
|
| 88 |
must=[
|
|
|
|
| 92 |
)
|
| 93 |
]
|
| 94 |
)
|
| 95 |
+
|
| 96 |
try:
|
| 97 |
# Perform the search in Qdrant with the query vector and session filter
|
| 98 |
search_result: List[ScrollResult] = qdrant_client.search(
|
| 99 |
collection_name=KB_COLLECTION,
|
| 100 |
query_vector=query_embedding,
|
| 101 |
+
query_filter=session_filter, # Apply the session-specific filter
|
| 102 |
+
limit=top_k, # Number of top results to retrieve
|
| 103 |
+
with_payload=True # Ensure payload (text and metadata) is returned
|
| 104 |
)
|
| 105 |
+
|
| 106 |
context_chunks = []
|
| 107 |
for hit in search_result:
|
| 108 |
# Extract the text content from the payload of each relevant hit
|
| 109 |
if hit.payload and 'text' in hit.payload:
|
| 110 |
context_chunks.append(hit.payload['text'])
|
| 111 |
+
# print(f" Hit: {hit.payload.get('text', '')[:50]}... (Score: {hit.score})") # Debugging line
|
| 112 |
+
|
| 113 |
if context_chunks:
|
| 114 |
# Join relevant chunks into a single string to provide to the LLM
|
| 115 |
return "\n\n".join(context_chunks)
|
| 116 |
else:
|
| 117 |
print(f"No relevant context found in KB for session '{session_id}' and query: '{query_text}'")
|
| 118 |
+
return "" # No relevant context found for this session
|
| 119 |
+
|
| 120 |
except Exception as e:
|
| 121 |
print(f"Error during knowledge base search for session '{session_id}': {e}")
|
| 122 |
+
return "" # Return empty string on error
|