Spaces:
Runtime error
Runtime error
Commit
·
244f753
1
Parent(s):
e23989a
Change in app.py
Browse files
- app.py +17 -10
- populate_db.py +36 -26
app.py
CHANGED
|
@@ -31,8 +31,15 @@ MILVUS_URI = os.getenv("MILVUS_URI", "http://localhost:19530")
|
|
| 31 |
milvus_client = MilvusClient(uri=MILVUS_URI)
|
| 32 |
collection_name = "my_rag_collection"
|
| 33 |
|
|
|
|
| 34 |
if not milvus_client.has_collection(collection_name):
|
| 35 |
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
|
| 38 |
|
|
@@ -61,7 +68,6 @@ def retrieve_relevant_documents(query: str, limit: int = 5) -> List[Dict]:
|
|
| 61 |
limit=limit,
|
| 62 |
output_fields=["text", "metadata"]
|
| 63 |
)
|
| 64 |
-
# print("search_results:", search_results[0])
|
| 65 |
documents = []
|
| 66 |
for result in search_results[0]:
|
| 67 |
doc_info = {
|
|
@@ -128,15 +134,16 @@ def setup_rag_chain():
|
|
| 128 |
When answering questions, you should:
|
| 129 |
1. Use the provided context documents to inform your response
|
| 130 |
2. Be accurate and helpful
|
| 131 |
-
3.
|
| 132 |
-
4.
|
| 133 |
-
5.
|
| 134 |
-
6.
|
| 135 |
-
7.
|
| 136 |
-
8.
|
| 137 |
-
9. Do not
|
| 138 |
-
10.
|
| 139 |
-
11. If the user
|
|
|
|
| 140 |
|
| 141 |
Context documents:
|
| 142 |
{context}
|
|
|
|
| 31 |
milvus_client = MilvusClient(uri=MILVUS_URI)
|
| 32 |
collection_name = "my_rag_collection"
|
| 33 |
|
| 34 |
+
# Initialize collection once at startup
|
| 35 |
if not milvus_client.has_collection(collection_name):
|
| 36 |
main()
|
| 37 |
+
else:
|
| 38 |
+
# Check if collection has data, populate if empty
|
| 39 |
+
stats = milvus_client.get_collection_stats(collection_name)
|
| 40 |
+
if stats['row_count'] == 0:
|
| 41 |
+
main()
|
| 42 |
+
milvus_client.load_collection(collection_name=collection_name)
|
| 43 |
|
| 44 |
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
|
| 45 |
|
|
|
|
| 68 |
limit=limit,
|
| 69 |
output_fields=["text", "metadata"]
|
| 70 |
)
|
|
|
|
| 71 |
documents = []
|
| 72 |
for result in search_results[0]:
|
| 73 |
doc_info = {
|
|
|
|
| 134 |
When answering questions, you should:
|
| 135 |
1. Use the provided context documents to inform your response
|
| 136 |
2. Be accurate and helpful
|
| 137 |
+
3. Cite relevant documents in the format [1], [2], etc.
|
| 138 |
+
4. If the context doesn't contain relevant information, say so clearly
|
| 139 |
+
5. Always reply in English
|
| 140 |
+
6. Provide clear recommendations wherever applicable
|
| 141 |
+
7. Do not make assumptions about the user's knowledge or background
|
| 142 |
+
8. If the user asks for a specific law or regulation, provide a brief explanation and cite relevant documents if available.
|
| 143 |
+
9. Do not overlook the importance of accessibility and inclusivity in your responses.
|
| 144 |
+
10. Do not overemphasize disability in your responses, but rather focus on the support and adjustments that can be made to ensure equality and inclusivity.
|
| 145 |
+
11. If the user asks about a specific disability, provide general information and resources, but do not make assumptions about the individual's experience or needs.
|
| 146 |
+
12. If the user query explicitly asks for a disability-related topic, provide a well-informed response based on the context documents.
|
| 147 |
|
| 148 |
Context documents:
|
| 149 |
{context}
|
populate_db.py
CHANGED
|
@@ -13,10 +13,6 @@ MILVUS_URI = os.getenv("MILVUS_URI", "http://localhost:19530")
|
|
| 13 |
milvus_client = MilvusClient(uri=MILVUS_URI)
|
| 14 |
collection_name = "my_rag_collection"
|
| 15 |
|
| 16 |
-
# Drop existing collection if it exists
|
| 17 |
-
# if milvus_client.has_collection(collection_name):
|
| 18 |
-
# milvus_client.drop_collection(collection_name)
|
| 19 |
-
|
| 20 |
# Initialize embedding model
|
| 21 |
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
|
| 22 |
|
|
@@ -24,35 +20,49 @@ def emb_text(text):
|
|
| 24 |
"""Generate embeddings for text using the sentence transformer model."""
|
| 25 |
return embedding_model.encode([text], normalize_embeddings=True).tolist()[0]
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
field_name="
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
-
# Create and load collection
|
| 43 |
-
milvus_client.create_collection(
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
)
|
| 49 |
-
milvus_client.load_collection(collection_name=collection_name)
|
| 50 |
|
| 51 |
# Document directory
|
| 52 |
directory_path = "data/"
|
| 53 |
|
| 54 |
def main():
|
| 55 |
"""Main function to load documents and insert them into Milvus."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
docs = unstructured_document_loader()
|
| 57 |
|
| 58 |
# Prepare data for insertion
|
|
|
|
| 13 |
milvus_client = MilvusClient(uri=MILVUS_URI)
|
| 14 |
collection_name = "my_rag_collection"
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
# Initialize embedding model
|
| 17 |
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
|
| 18 |
|
|
|
|
| 20 |
"""Generate embeddings for text using the sentence transformer model."""
|
| 21 |
return embedding_model.encode([text], normalize_embeddings=True).tolist()[0]
|
| 22 |
|
| 23 |
+
def create_collection():
|
| 24 |
+
"""Create collection if it doesn't exist."""
|
| 25 |
+
if milvus_client.has_collection(collection_name):
|
| 26 |
+
milvus_client.load_collection(collection_name=collection_name)
|
| 27 |
+
return
|
| 28 |
+
|
| 29 |
+
# Create Milvus collection schema
|
| 30 |
+
schema = milvus_client.create_schema(auto_id=False, enable_dynamic_field=False)
|
| 31 |
+
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
|
| 32 |
+
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=384) # BGE-small-en-v1.5 dimension
|
| 33 |
+
schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=32768) # 32KB max
|
| 34 |
+
schema.add_field(field_name="metadata", datatype=DataType.JSON)
|
| 35 |
+
|
| 36 |
+
# Create index for vector search
|
| 37 |
+
index_params = MilvusClient.prepare_index_params()
|
| 38 |
+
index_params.add_index(
|
| 39 |
+
field_name="vector",
|
| 40 |
+
metric_type="COSINE",
|
| 41 |
+
index_type="AUTOINDEX",
|
| 42 |
+
)
|
| 43 |
|
| 44 |
+
# Create and load collection
|
| 45 |
+
milvus_client.create_collection(
|
| 46 |
+
collection_name=collection_name,
|
| 47 |
+
schema=schema,
|
| 48 |
+
index_params=index_params,
|
| 49 |
+
consistency_level="Strong",
|
| 50 |
+
)
|
| 51 |
+
milvus_client.load_collection(collection_name=collection_name)
|
| 52 |
|
| 53 |
# Document directory
|
| 54 |
directory_path = "data/"
|
| 55 |
|
| 56 |
def main():
|
| 57 |
"""Main function to load documents and insert them into Milvus."""
|
| 58 |
+
create_collection()
|
| 59 |
+
|
| 60 |
+
# Check if collection already has data
|
| 61 |
+
stats = milvus_client.get_collection_stats(collection_name)
|
| 62 |
+
if stats['row_count'] > 0:
|
| 63 |
+
print(f"Collection already contains {stats['row_count']} documents. Skipping insertion.")
|
| 64 |
+
return
|
| 65 |
+
|
| 66 |
docs = unstructured_document_loader()
|
| 67 |
|
| 68 |
# Prepare data for insertion
|