Spaces:

daniel-was-taken
/

CompifAI

Runtime error

App Files Community

daniel-was-taken committed on Jul 23, 2025

Commit

244f753

1 Parent(s): e23989a

Change in app.py

Browse files

Files changed (2) hide show

app.py +17 -10
populate_db.py +36 -26

app.py CHANGED Viewed

@@ -31,8 +31,15 @@ MILVUS_URI = os.getenv("MILVUS_URI", "http://localhost:19530")
 milvus_client = MilvusClient(uri=MILVUS_URI)
 collection_name = "my_rag_collection"
 if not milvus_client.has_collection(collection_name):
     main()
 embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
@@ -61,7 +68,6 @@ def retrieve_relevant_documents(query: str, limit: int = 5) -> List[Dict]:
             limit=limit,
             output_fields=["text", "metadata"]
         )
-        # print("search_results:", search_results[0])
         documents = []
         for result in search_results[0]:
             doc_info = {
@@ -128,15 +134,16 @@ def setup_rag_chain():
 When answering questions, you should:
 1. Use the provided context documents to inform your response
 2. Be accurate and helpful
-3. If the context doesn't contain relevant information, say so clearly
-4. Always reply in English
-5. Provide clear recommendations wherever applicable
-6. Do not make assumptions about the user's knowledge or background
-7. If the user asks for a specific law or regulation, provide a brief explanation and cite relevant documents if available.
-8. Do not overlook the importance of accessibility and inclusivity in your responses.
-9. Do not overemphasize disability in your responses, but rather focus on the support and adjustments that can be made to ensure equality and inclusivity.
-10. If the user asks about a specific disability, provide general information and resources, but do not make assumptions about the individual's experience or needs.
-11. If the user query explicitly asks for a disability-related topic, provide a well-informed response based on the context documents.
 Context documents:
 {context}

 milvus_client = MilvusClient(uri=MILVUS_URI)
 collection_name = "my_rag_collection"
+# Initialize collection once at startup
 if not milvus_client.has_collection(collection_name):
     main()
+else:
+    # Check if collection has data, populate if empty
+    stats = milvus_client.get_collection_stats(collection_name)
+    if stats['row_count'] == 0:
+        main()
+    milvus_client.load_collection(collection_name=collection_name)
 embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
             limit=limit,
             output_fields=["text", "metadata"]
         )
         documents = []
         for result in search_results[0]:
             doc_info = {
 When answering questions, you should:
 1. Use the provided context documents to inform your response
 2. Be accurate and helpful
+3. Cite relevant documents in the format [1], [2], etc.
+4. If the context doesn't contain relevant information, say so clearly
+5. Always reply in English
+6. Provide clear recommendations wherever applicable
+7. Do not make assumptions about the user's knowledge or background
+8. If the user asks for a specific law or regulation, provide a brief explanation and cite relevant documents if available.
+9. Do not overlook the importance of accessibility and inclusivity in your responses.
+10. Do not overemphasize disability in your responses, but rather focus on the support and adjustments that can be made to ensure equality and inclusivity.
+11. If the user asks about a specific disability, provide general information and resources, but do not make assumptions about the individual's experience or needs.
+12. If the user query explicitly asks for a disability-related topic, provide a well-informed response based on the context documents.
 Context documents:
 {context}

populate_db.py CHANGED Viewed

@@ -13,10 +13,6 @@ MILVUS_URI = os.getenv("MILVUS_URI", "http://localhost:19530")
 milvus_client = MilvusClient(uri=MILVUS_URI)
 collection_name = "my_rag_collection"
-# Drop existing collection if it exists
-# if milvus_client.has_collection(collection_name):
-#     milvus_client.drop_collection(collection_name)
 # Initialize embedding model
 embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
@@ -24,35 +20,49 @@ def emb_text(text):
     """Generate embeddings for text using the sentence transformer model."""
     return embedding_model.encode([text], normalize_embeddings=True).tolist()[0]
-# Create Milvus collection schema
-schema = milvus_client.create_schema(auto_id=False, enable_dynamic_field=False)
-schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
-schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=384)  # BGE-small-en-v1.5 dimension
-schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=32768)  # 32KB max
-schema.add_field(field_name="metadata", datatype=DataType.JSON)
-# Create index for vector search
-index_params = MilvusClient.prepare_index_params()
-index_params.add_index(
-    field_name="vector",
-    metric_type="COSINE",
-    index_type="AUTOINDEX",
-)
-# Create and load collection
-milvus_client.create_collection(
-    collection_name=collection_name,
-    schema=schema,
-    index_params=index_params,
-    consistency_level="Strong",
-)
-milvus_client.load_collection(collection_name=collection_name)
 # Document directory
 directory_path = "data/"
 def main():
     """Main function to load documents and insert them into Milvus."""
     docs = unstructured_document_loader()
     # Prepare data for insertion

 milvus_client = MilvusClient(uri=MILVUS_URI)
 collection_name = "my_rag_collection"
 # Initialize embedding model
 embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
     """Generate embeddings for text using the sentence transformer model."""
     return embedding_model.encode([text], normalize_embeddings=True).tolist()[0]
+def create_collection():
+    """Create collection if it doesn't exist."""
+    if milvus_client.has_collection(collection_name):
+        milvus_client.load_collection(collection_name=collection_name)
+        return
+    # Create Milvus collection schema
+    schema = milvus_client.create_schema(auto_id=False, enable_dynamic_field=False)
+    schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
+    schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=384)  # BGE-small-en-v1.5 dimension
+    schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=32768)  # 32KB max
+    schema.add_field(field_name="metadata", datatype=DataType.JSON)
+    # Create index for vector search
+    index_params = MilvusClient.prepare_index_params()
+    index_params.add_index(
+        field_name="vector",
+        metric_type="COSINE",
+        index_type="AUTOINDEX",
+    )
+    # Create and load collection
+    milvus_client.create_collection(
+        collection_name=collection_name,
+        schema=schema,
+        index_params=index_params,
+        consistency_level="Strong",
+    )
+    milvus_client.load_collection(collection_name=collection_name)
 # Document directory
 directory_path = "data/"
 def main():
     """Main function to load documents and insert them into Milvus."""
+    create_collection()
+    # Check if collection already has data
+    stats = milvus_client.get_collection_stats(collection_name)
+    if stats['row_count'] > 0:
+        print(f"Collection already contains {stats['row_count']} documents. Skipping insertion.")
+        return
     docs = unstructured_document_loader()
     # Prepare data for insertion