mtyrrell committed on
Commit
5c394c9
·
1 Parent(s): 5b77e39

ingestor fix

Browse files
params.cfg CHANGED
@@ -7,6 +7,7 @@ reranker_endpoint_url = https://whikfgijnuog8fjv.eu-west-1.aws.endpoints.hugging
7
  # for native just give url
8
  mode = native
9
  url = https://de438521-e2dd-43d9-b41b-b2e18299a2c0.europe-west3-0.gcp.cloud.qdrant.io:6333
 
10
  port = 443
11
  collection = allreports
12
 
@@ -17,18 +18,19 @@ final_k = 5
17
  [generator]
18
  PROVIDER = huggingface
19
  MODEL = meta-llama/Meta-Llama-3-8B-Instruct
20
- MAX_TOKENS = 1024
21
  TEMPERATURE = 0.1
22
  INFERENCE_PROVIDER = novita
23
  ORGANIZATION = GIZ
24
  CONTEXT_META_FIELDS = filename,project_id,document_source
25
  TITLE_META_FIELDS = filename,page
26
 
27
-
28
  [ingestor]
29
  # Size of each text chunk in characters
30
  chunk_size = 700
31
  # Overlap between consecutive chunks in characters
32
  chunk_overlap = 50
 
 
33
  # Text separators for splitting, comma-separated (order of preference)
34
  separators = \n\n,\n,. ,! ,? , ,
 
7
  # for native just give url
8
  mode = native
9
  url = https://de438521-e2dd-43d9-b41b-b2e18299a2c0.europe-west3-0.gcp.cloud.qdrant.io:6333
10
+ # NOTE: The API key should be set via QDRANT_API_KEY environment variable.
11
  port = 443
12
  collection = allreports
13
 
 
18
  [generator]
19
  PROVIDER = huggingface
20
  MODEL = meta-llama/Meta-Llama-3-8B-Instruct
21
+ MAX_TOKENS = 2048
22
  TEMPERATURE = 0.1
23
  INFERENCE_PROVIDER = novita
24
  ORGANIZATION = GIZ
25
  CONTEXT_META_FIELDS = filename,project_id,document_source
26
  TITLE_META_FIELDS = filename,page
27
 
 
28
  [ingestor]
29
  # Size of each text chunk in characters
30
  chunk_size = 700
31
  # Overlap between consecutive chunks in characters
32
  chunk_overlap = 50
33
+ # Maximum number of chunks to send to LLM (prevents context overflow)
34
+ max_chunks = 20
35
  # Text separators for splitting, comma-separated (order of preference)
36
  separators = \n\n,\n,. ,! ,? , ,
src/components/ingestor/ingestor.py CHANGED
@@ -64,6 +64,7 @@ def clean_and_chunk_text(text: str, config) -> str:
64
  # Get chunking parameters from config
65
  chunk_size = config.getint('ingestor', 'chunk_size', fallback=700)
66
  chunk_overlap = config.getint('ingestor', 'chunk_overlap', fallback=50)
 
67
  separators_str = config.get('ingestor', 'separators', fallback=r'\n\n,\n,. ,! ,? , ,')
68
  separators = [s.strip() for s in separators_str.split(',')]
69
 
@@ -78,9 +79,14 @@ def clean_and_chunk_text(text: str, config) -> str:
78
 
79
  chunks = text_splitter.split_text(text)
80
 
 
 
 
 
 
81
  # Create formatted context with chunk markers
82
  context_parts = []
83
- for i, chunk_text in enumerate(chunks):
84
  context_parts.append(f"[Chunk {i+1}]: {chunk_text}")
85
 
86
  return "\n\n".join(context_parts)
 
64
  # Get chunking parameters from config
65
  chunk_size = config.getint('ingestor', 'chunk_size', fallback=700)
66
  chunk_overlap = config.getint('ingestor', 'chunk_overlap', fallback=50)
67
+ max_chunks = config.getint('ingestor', 'max_chunks', fallback=20) # Limit chunks sent to LLM
68
  separators_str = config.get('ingestor', 'separators', fallback=r'\n\n,\n,. ,! ,? , ,')
69
  separators = [s.strip() for s in separators_str.split(',')]
70
 
 
79
 
80
  chunks = text_splitter.split_text(text)
81
 
82
+ # Limit the number of chunks to prevent context overflow
83
+ chunks_to_use = chunks[:max_chunks]
84
+ if len(chunks) > max_chunks:
85
+ logger.warning(f"Document has {len(chunks)} chunks, limiting to first {max_chunks} chunks")
86
+
87
  # Create formatted context with chunk markers
88
  context_parts = []
89
+ for i, chunk_text in enumerate(chunks_to_use):
90
  context_parts.append(f"[Chunk {i+1}]: {chunk_text}")
91
 
92
  return "\n\n".join(context_parts)