daniel-was-taken committed on
Commit
c66410d
·
1 Parent(s): d878772

Change max_length

Browse files
Files changed (2) hide show
  1. .gitignore +3 -0
  2. populate_db.py +3 -3
.gitignore CHANGED
@@ -223,3 +223,6 @@ secrets/
223
 
224
  # Docker volumes (production)
225
  volumes/
 
 
 
 
223
 
224
  # Docker volumes (production)
225
  volumes/
226
+
227
+ simple_analysis.py
228
+ # This file is used for simple analysis of the codebase, such as checking for unused imports or variables.
populate_db.py CHANGED
@@ -39,7 +39,7 @@ def create_collection():
39
  schema = milvus_client.create_schema(auto_id=False, enable_dynamic_field=False)
40
  schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
41
  schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=4096) # Qwen/Qwen3-Embedding-8B dimension
42
- schema.add_field(field_name="text", datatype=DataType.VARCHAR) # 32KB max
43
  schema.add_field(field_name="metadata", datatype=DataType.JSON)
44
 
45
  # Create index for vector search
@@ -82,8 +82,8 @@ def main():
82
  for i, doc in enumerate(docs):
83
  # Check text length and truncate if necessary
84
  text_content = doc.page_content
85
- if len(text_content) > 32000: # Leave some buffer below 32KB limit
86
- text_content = text_content[:32000]
87
  print(f"Document {i+1} truncated from {len(doc.page_content)} to {len(text_content)} characters")
88
 
89
  # Generate embedding for the document content
 
39
  schema = milvus_client.create_schema(auto_id=False, enable_dynamic_field=False)
40
  schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
41
  schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=4096) # Qwen/Qwen3-Embedding-8B dimension
42
+ schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=65536) # 64KB max
43
  schema.add_field(field_name="metadata", datatype=DataType.JSON)
44
 
45
  # Create index for vector search
 
82
  for i, doc in enumerate(docs):
83
  # Check text length and truncate if necessary
84
  text_content = doc.page_content
85
+ if len(text_content) > 65000: # Leave some buffer below 64KB limit
86
+ text_content = text_content[:65000]
87
  print(f"Document {i+1} truncated from {len(doc.page_content)} to {len(text_content)} characters")
88
 
89
  # Generate embedding for the document content