daniel-was-taken committed on
Commit
c66410d
·
1 Parent(s): d878772

Change max_length

Browse files
Files changed (2) hide show
  1. .gitignore +3 -0
  2. populate_db.py +3 -3
.gitignore CHANGED
@@ -223,3 +223,6 @@ secrets/
223
 
224
  # Docker volumes (production)
225
  volumes/
 
 
 
 
223
 
224
  # Docker volumes (production)
225
  volumes/
226
+
227
+ simple_analysis.py
228
+ # This file is used for simple analysis of the codebase, such as checking for unused imports or variables.
populate_db.py CHANGED
@@ -39,7 +39,7 @@ def create_collection():
39
  schema = milvus_client.create_schema(auto_id=False, enable_dynamic_field=False)
40
  schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
41
  schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=4096) # Qwen/Qwen3-Embedding-8B dimension
42
- schema.add_field(field_name="text", datatype=DataType.VARCHAR) # 32KB max
43
  schema.add_field(field_name="metadata", datatype=DataType.JSON)
44
 
45
  # Create index for vector search
@@ -82,8 +82,8 @@ def main():
82
  for i, doc in enumerate(docs):
83
  # Check text length and truncate if necessary
84
  text_content = doc.page_content
85
- if len(text_content) > 32000: # Leave some buffer below 32KB limit
86
- text_content = text_content[:32000]
87
  print(f"Document {i+1} truncated from {len(doc.page_content)} to {len(text_content)} characters")
88
 
89
  # Generate embedding for the document content
 
39
  schema = milvus_client.create_schema(auto_id=False, enable_dynamic_field=False)
40
  schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
41
  schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=4096) # Qwen/Qwen3-Embedding-8B dimension
42
+ schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=65536) # 64KB max
43
  schema.add_field(field_name="metadata", datatype=DataType.JSON)
44
 
45
  # Create index for vector search
 
82
  for i, doc in enumerate(docs):
83
  # Check text length and truncate if necessary
84
  text_content = doc.page_content
85
+ if len(text_content) > 65000: # Leave some buffer below 64KB limit
86
+ text_content = text_content[:65000]
87
  print(f"Document {i+1} truncated from {len(doc.page_content)} to {len(text_content)} characters")
88
 
89
  # Generate embedding for the document content