mtyrrell committed on
Commit
5c394c9
·
1 Parent(s): 5b77e39

ingestor fix

Browse files
params.cfg CHANGED
@@ -7,6 +7,7 @@ reranker_endpoint_url = https://whikfgijnuog8fjv.eu-west-1.aws.endpoints.hugging
7
  # for native just give url
8
  mode = native
9
  url = https://de438521-e2dd-43d9-b41b-b2e18299a2c0.europe-west3-0.gcp.cloud.qdrant.io:6333
 
10
  port = 443
11
  collection = allreports
12
 
@@ -17,18 +18,19 @@ final_k = 5
17
  [generator]
18
  PROVIDER = huggingface
19
  MODEL = meta-llama/Meta-Llama-3-8B-Instruct
20
- MAX_TOKENS = 1024
21
  TEMPERATURE = 0.1
22
  INFERENCE_PROVIDER = novita
23
  ORGANIZATION = GIZ
24
  CONTEXT_META_FIELDS = filename,project_id,document_source
25
  TITLE_META_FIELDS = filename,page
26
 
27
-
28
  [ingestor]
29
  # Size of each text chunk in characters
30
  chunk_size = 700
31
  # Overlap between consecutive chunks in characters
32
  chunk_overlap = 50
 
 
33
  # Text separators for splitting, comma-separated (order of preference)
34
  separators = \n\n,\n,. ,! ,? , ,
 
7
  # for native just give url
8
  mode = native
9
  url = https://de438521-e2dd-43d9-b41b-b2e18299a2c0.europe-west3-0.gcp.cloud.qdrant.io:6333
10
+ # NOTE: The API key should be set via QDRANT_API_KEY environment variable.
11
  port = 443
12
  collection = allreports
13
 
 
18
  [generator]
19
  PROVIDER = huggingface
20
  MODEL = meta-llama/Meta-Llama-3-8B-Instruct
21
+ MAX_TOKENS = 2048
22
  TEMPERATURE = 0.1
23
  INFERENCE_PROVIDER = novita
24
  ORGANIZATION = GIZ
25
  CONTEXT_META_FIELDS = filename,project_id,document_source
26
  TITLE_META_FIELDS = filename,page
27
 
 
28
  [ingestor]
29
  # Size of each text chunk in characters
30
  chunk_size = 700
31
  # Overlap between consecutive chunks in characters
32
  chunk_overlap = 50
33
+ # Maximum number of chunks to send to LLM (prevents context overflow)
34
+ max_chunks = 20
35
  # Text separators for splitting, comma-separated (order of preference)
36
  separators = \n\n,\n,. ,! ,? , ,
src/components/ingestor/ingestor.py CHANGED
@@ -64,6 +64,7 @@ def clean_and_chunk_text(text: str, config) -> str:
64
  # Get chunking parameters from config
65
  chunk_size = config.getint('ingestor', 'chunk_size', fallback=700)
66
  chunk_overlap = config.getint('ingestor', 'chunk_overlap', fallback=50)
 
67
  separators_str = config.get('ingestor', 'separators', fallback=r'\n\n,\n,. ,! ,? , ,')
68
  separators = [s.strip() for s in separators_str.split(',')]
69
 
@@ -78,9 +79,14 @@ def clean_and_chunk_text(text: str, config) -> str:
78
 
79
  chunks = text_splitter.split_text(text)
80
 
 
 
 
 
 
81
  # Create formatted context with chunk markers
82
  context_parts = []
83
- for i, chunk_text in enumerate(chunks):
84
  context_parts.append(f"[Chunk {i+1}]: {chunk_text}")
85
 
86
  return "\n\n".join(context_parts)
 
64
  # Get chunking parameters from config
65
  chunk_size = config.getint('ingestor', 'chunk_size', fallback=700)
66
  chunk_overlap = config.getint('ingestor', 'chunk_overlap', fallback=50)
67
+ max_chunks = config.getint('ingestor', 'max_chunks', fallback=20) # Limit chunks sent to LLM
68
  separators_str = config.get('ingestor', 'separators', fallback=r'\n\n,\n,. ,! ,? , ,')
69
  separators = [s.strip() for s in separators_str.split(',')]
70
 
 
79
 
80
  chunks = text_splitter.split_text(text)
81
 
82
+ # Limit the number of chunks to prevent context overflow
83
+ chunks_to_use = chunks[:max_chunks]
84
+ if len(chunks) > max_chunks:
85
+ logger.warning(f"Document has {len(chunks)} chunks, limiting to first {max_chunks} chunks")
86
+
87
  # Create formatted context with chunk markers
88
  context_parts = []
89
+ for i, chunk_text in enumerate(chunks_to_use):
90
  context_parts.append(f"[Chunk {i+1}]: {chunk_text}")
91
 
92
  return "\n\n".join(context_parts)