Spaces:
Sleeping
Sleeping
ingestor fix
Browse files
- params.cfg +4 -2
- src/components/ingestor/ingestor.py +7 -1
params.cfg
CHANGED
|
@@ -7,6 +7,7 @@ reranker_endpoint_url = https://whikfgijnuog8fjv.eu-west-1.aws.endpoints.hugging
|
|
| 7 |
# for native just give url
|
| 8 |
mode = native
|
| 9 |
url = https://de438521-e2dd-43d9-b41b-b2e18299a2c0.europe-west3-0.gcp.cloud.qdrant.io:6333
|
|
|
|
| 10 |
port = 443
|
| 11 |
collection = allreports
|
| 12 |
|
|
@@ -17,18 +18,19 @@ final_k = 5
|
|
| 17 |
[generator]
|
| 18 |
PROVIDER = huggingface
|
| 19 |
MODEL = meta-llama/Meta-Llama-3-8B-Instruct
|
| 20 |
-
MAX_TOKENS =
|
| 21 |
TEMPERATURE = 0.1
|
| 22 |
INFERENCE_PROVIDER = novita
|
| 23 |
ORGANIZATION = GIZ
|
| 24 |
CONTEXT_META_FIELDS = filename,project_id,document_source
|
| 25 |
TITLE_META_FIELDS = filename,page
|
| 26 |
|
| 27 |
-
|
| 28 |
[ingestor]
|
| 29 |
# Size of each text chunk in characters
|
| 30 |
chunk_size = 700
|
| 31 |
# Overlap between consecutive chunks in characters
|
| 32 |
chunk_overlap = 50
|
|
|
|
|
|
|
| 33 |
# Text separators for splitting, comma-separated (order of preference)
|
| 34 |
separators = \n\n,\n,. ,! ,? , ,
|
|
|
|
| 7 |
# for native just give url
|
| 8 |
mode = native
|
| 9 |
url = https://de438521-e2dd-43d9-b41b-b2e18299a2c0.europe-west3-0.gcp.cloud.qdrant.io:6333
|
| 10 |
+
# NOTE: The API key should be set via the QDRANT_API_KEY environment variable.
|
| 11 |
port = 443
|
| 12 |
collection = allreports
|
| 13 |
|
|
|
|
| 18 |
[generator]
|
| 19 |
PROVIDER = huggingface
|
| 20 |
MODEL = meta-llama/Meta-Llama-3-8B-Instruct
|
| 21 |
+
MAX_TOKENS = 2048
|
| 22 |
TEMPERATURE = 0.1
|
| 23 |
INFERENCE_PROVIDER = novita
|
| 24 |
ORGANIZATION = GIZ
|
| 25 |
CONTEXT_META_FIELDS = filename,project_id,document_source
|
| 26 |
TITLE_META_FIELDS = filename,page
|
| 27 |
|
|
|
|
| 28 |
[ingestor]
|
| 29 |
# Size of each text chunk in characters
|
| 30 |
chunk_size = 700
|
| 31 |
# Overlap between consecutive chunks in characters
|
| 32 |
chunk_overlap = 50
|
| 33 |
+
# Maximum number of chunks to send to LLM (prevents context overflow)
|
| 34 |
+
max_chunks = 20
|
| 35 |
# Text separators for splitting, comma-separated (order of preference)
|
| 36 |
separators = \n\n,\n,. ,! ,? , ,
|
src/components/ingestor/ingestor.py
CHANGED
|
@@ -64,6 +64,7 @@ def clean_and_chunk_text(text: str, config) -> str:
|
|
| 64 |
# Get chunking parameters from config
|
| 65 |
chunk_size = config.getint('ingestor', 'chunk_size', fallback=700)
|
| 66 |
chunk_overlap = config.getint('ingestor', 'chunk_overlap', fallback=50)
|
|
|
|
| 67 |
separators_str = config.get('ingestor', 'separators', fallback=r'\n\n,\n,. ,! ,? , ,')
|
| 68 |
separators = [s.strip() for s in separators_str.split(',')]
|
| 69 |
|
|
@@ -78,9 +79,14 @@ def clean_and_chunk_text(text: str, config) -> str:
|
|
| 78 |
|
| 79 |
chunks = text_splitter.split_text(text)
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
# Create formatted context with chunk markers
|
| 82 |
context_parts = []
|
| 83 |
-
for i, chunk_text in enumerate(chunks):
|
| 84 |
context_parts.append(f"[Chunk {i+1}]: {chunk_text}")
|
| 85 |
|
| 86 |
return "\n\n".join(context_parts)
|
|
|
|
| 64 |
# Get chunking parameters from config
|
| 65 |
chunk_size = config.getint('ingestor', 'chunk_size', fallback=700)
|
| 66 |
chunk_overlap = config.getint('ingestor', 'chunk_overlap', fallback=50)
|
| 67 |
+
max_chunks = config.getint('ingestor', 'max_chunks', fallback=20) # Limit chunks sent to LLM
|
| 68 |
separators_str = config.get('ingestor', 'separators', fallback=r'\n\n,\n,. ,! ,? , ,')
|
| 69 |
separators = [s.strip() for s in separators_str.split(',')]
|
| 70 |
|
|
|
|
| 79 |
|
| 80 |
chunks = text_splitter.split_text(text)
|
| 81 |
|
| 82 |
+
# Limit the number of chunks to prevent context overflow
|
| 83 |
+
chunks_to_use = chunks[:max_chunks]
|
| 84 |
+
if len(chunks) > max_chunks:
|
| 85 |
+
logger.warning(f"Document has {len(chunks)} chunks, limiting to first {max_chunks} chunks")
|
| 86 |
+
|
| 87 |
# Create formatted context with chunk markers
|
| 88 |
context_parts = []
|
| 89 |
+
for i, chunk_text in enumerate(chunks_to_use):
|
| 90 |
context_parts.append(f"[Chunk {i+1}]: {chunk_text}")
|
| 91 |
|
| 92 |
return "\n\n".join(context_parts)
|