Spaces:
Sleeping
Sleeping
Commit
·
abdb242
1
Parent(s):
bfd4369
fixing chunk size + overlap
Browse files
- app.py +11 -0
- config_1.py +1 -1
app.py
CHANGED
|
@@ -40,6 +40,8 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
|
| 40 |
try:
|
| 41 |
log_message("Инициализация системы")
|
| 42 |
os.makedirs(download_dir, exist_ok=True)
|
|
|
|
|
|
|
| 43 |
|
| 44 |
embed_model = get_embedding_model()
|
| 45 |
llm = get_llm_model(DEFAULT_MODEL)
|
|
@@ -47,6 +49,15 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
|
| 47 |
|
| 48 |
Settings.embed_model = embed_model
|
| 49 |
Settings.llm = llm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
all_documents = []
|
| 52 |
chunks_df = None
|
|
|
|
| 40 |
try:
|
| 41 |
log_message("Инициализация системы")
|
| 42 |
os.makedirs(download_dir, exist_ok=True)
|
| 43 |
+
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 44 |
+
from llama_index.core.text_splitter import SentenceSplitter
|
| 45 |
|
| 46 |
embed_model = get_embedding_model()
|
| 47 |
llm = get_llm_model(DEFAULT_MODEL)
|
|
|
|
| 49 |
|
| 50 |
Settings.embed_model = embed_model
|
| 51 |
Settings.llm = llm
|
| 52 |
+
Settings.text_splitter = SentenceSplitter(
|
| 53 |
+
chunk_size=CHUNK_SIZE,
|
| 54 |
+
chunk_overlap=CHUNK_OVERLAP,
|
| 55 |
+
separator=" "
|
| 56 |
+
)
|
| 57 |
+
# Add this after setting Settings
|
| 58 |
+
log_message(f"Configured chunk size: {CHUNK_SIZE}")
|
| 59 |
+
log_message(f"Configured chunk overlap: {CHUNK_OVERLAP}")
|
| 60 |
+
log_message(f"Settings text splitter chunk size: {Settings.text_splitter.chunk_size if hasattr(Settings, 'text_splitter') else 'Not set'}")
|
| 61 |
|
| 62 |
all_documents = []
|
| 63 |
chunks_df = None
|
config_1.py
CHANGED
|
@@ -16,7 +16,7 @@ download_dir = "rag_files"
|
|
| 16 |
HF_TOKEN = os.getenv('HF_TOKEN')
|
| 17 |
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
|
| 18 |
|
| 19 |
-
CHUNK_SIZE =
|
| 20 |
CHUNK_OVERLAP = 256
|
| 21 |
|
| 22 |
|
|
|
|
| 16 |
HF_TOKEN = os.getenv('HF_TOKEN')
|
| 17 |
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
|
| 18 |
|
| 19 |
+
CHUNK_SIZE = 2048
|
| 20 |
CHUNK_OVERLAP = 256
|
| 21 |
|
| 22 |
|