MrSimple07 committed on
Commit
3f09b3e
·
1 Parent(s): 499b5c3

process_documents_with_chunking improvement

Browse files
Files changed (3) hide show
  1. app.py +15 -5
  2. document_processor.py +0 -263
  3. documents_prep.py +68 -30
app.py CHANGED
@@ -96,6 +96,7 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
96
  json_files_dir=None, table_data_dir=None, image_data_dir=None,
97
  use_json_instead_csv=False):
98
  try:
 
99
  log_message("Инициализация системы")
100
  os.makedirs(download_dir, exist_ok=True)
101
  from config import CHUNK_SIZE, CHUNK_OVERLAP
@@ -112,10 +113,9 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
112
  chunk_overlap=CHUNK_OVERLAP,
113
  separator=" "
114
  )
115
- # Add this after setting Settings
116
  log_message(f"Configured chunk size: {CHUNK_SIZE}")
117
  log_message(f"Configured chunk overlap: {CHUNK_OVERLAP}")
118
- log_message(f"Settings text splitter chunk size: {Settings.text_splitter.chunk_size if hasattr(Settings, 'text_splitter') else 'Not set'}")
119
 
120
  all_documents = []
121
  chunks_df = None
@@ -135,14 +135,24 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
135
  if table_data_dir:
136
  log_message("Добавляю табличные данные")
137
  table_documents = load_table_data(repo_id, hf_token, table_data_dir)
138
- all_documents.extend(table_documents)
 
 
 
 
 
139
 
140
  if image_data_dir:
141
  log_message("Добавляю данные изображений")
142
  image_documents = load_image_data(repo_id, hf_token, image_data_dir)
143
- all_documents.extend(image_documents)
 
 
 
 
 
144
 
145
- log_message(f"Всего документов: {len(all_documents)}")
146
 
147
  vector_index = create_vector_index(all_documents)
148
  query_engine = create_query_engine(vector_index)
 
96
  json_files_dir=None, table_data_dir=None, image_data_dir=None,
97
  use_json_instead_csv=False):
98
  try:
99
+ from documents_prep import process_documents_with_chunking
100
  log_message("Инициализация системы")
101
  os.makedirs(download_dir, exist_ok=True)
102
  from config import CHUNK_SIZE, CHUNK_OVERLAP
 
113
  chunk_overlap=CHUNK_OVERLAP,
114
  separator=" "
115
  )
116
+
117
  log_message(f"Configured chunk size: {CHUNK_SIZE}")
118
  log_message(f"Configured chunk overlap: {CHUNK_OVERLAP}")
 
119
 
120
  all_documents = []
121
  chunks_df = None
 
135
  if table_data_dir:
136
  log_message("Добавляю табличные данные")
137
  table_documents = load_table_data(repo_id, hf_token, table_data_dir)
138
+ log_message(f"Загружено {len(table_documents)} табличных документов")
139
+
140
+ # Process table documents through chunking
141
+ chunked_table_docs, table_chunk_info = process_documents_with_chunking(table_documents)
142
+ all_documents.extend(chunked_table_docs)
143
+ chunk_info.extend(table_chunk_info)
144
 
145
  if image_data_dir:
146
  log_message("Добавляю данные изображений")
147
  image_documents = load_image_data(repo_id, hf_token, image_data_dir)
148
+ log_message(f"Загружено {len(image_documents)} документов изображений")
149
+
150
+ # Process image documents through chunking
151
+ chunked_image_docs, image_chunk_info = process_documents_with_chunking(image_documents)
152
+ all_documents.extend(chunked_image_docs)
153
+ chunk_info.extend(image_chunk_info)
154
 
155
+ log_message(f"Всего документов после всей обработки: {len(all_documents)}")
156
 
157
  vector_index = create_vector_index(all_documents)
158
  query_engine = create_query_engine(vector_index)
document_processor.py DELETED
@@ -1,263 +0,0 @@
1
- import os
2
- import fitz
3
- import pandas as pd
4
- from pathlib import Path
5
- from llama_index.core import Document, VectorStoreIndex
6
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
7
- from llama_index.core.query_engine import RetrieverQueryEngine
8
- from llama_index.core.retrievers import VectorIndexRetriever
9
- from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
10
- from llama_index.core.prompts import PromptTemplate
11
- from config import *
12
- import shutil
13
- import faiss
14
- from huggingface_hub import hf_hub_download
15
-
16
-
17
- def log_message(message):
18
- print(message, flush=True)
19
-
20
- def extract_text_from_pdf(file_path):
21
- doc = fitz.open(file_path)
22
- text = ""
23
- for page in doc:
24
- text += page.get_text()
25
- doc.close()
26
- return text
27
-
28
- def extract_text_from_txt(file_path):
29
- with open(file_path, 'r', encoding='utf-8') as file:
30
- return file.read()
31
-
32
- def chunk_text(text, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
33
- log_message(f"📄 Chunking text into pieces of {chunk_size} characters...")
34
- chunks = []
35
- start = 0
36
- while start < len(text):
37
- end = start + chunk_size
38
- chunk = text[start:end]
39
- chunks.append(chunk)
40
- start = end - chunk_overlap
41
- log_message(f"✅ Created {len(chunks)} chunks")
42
- return chunks
43
-
44
- def process_uploaded_file(file_path, file_name, doc_name, doc_link):
45
- try:
46
- log_message(f"🔄 Processing file: {file_name}")
47
-
48
- # Create upload directory if it doesn't exist
49
- upload_dir = "UPLOADED_DOCUMENTS"
50
- os.makedirs(upload_dir, exist_ok=True)
51
-
52
- # Copy uploaded file to permanent location
53
- permanent_file_path = os.path.join(upload_dir, file_name)
54
- if os.path.abspath(file_path) != os.path.abspath(permanent_file_path):
55
- shutil.copy2(file_path, permanent_file_path)
56
- log_message(f"📁 File saved to: {permanent_file_path}")
57
-
58
- file_extension = Path(file_path).suffix.lower()
59
-
60
- if file_extension == '.pdf':
61
- log_message("📖 Extracting text from PDF...")
62
- text = extract_text_from_pdf(file_path)
63
- elif file_extension == '.txt':
64
- log_message("📝 Reading text file...")
65
- text = extract_text_from_txt(file_path)
66
- else:
67
- return None, "Unsupported file type"
68
-
69
- word_count = len(text.split())
70
- log_message(f"📊 Extracted {word_count} words from document")
71
-
72
- chunks = chunk_text(text)
73
-
74
- return {
75
- 'document': doc_name,
76
- 'file_name': file_name,
77
- 'doc_link': doc_link,
78
- 'total_words': word_count,
79
- 'extracted_text': text,
80
- 'chunks': chunks
81
- }, None
82
-
83
- except Exception as e:
84
- log_message(f"❌ Error processing file: {str(e)}")
85
- return None, str(e)
86
-
87
- def get_existing_documents():
88
- try:
89
- # First check CSV file for processed documents
90
- chunks_csv_path = os.path.join(download_dir, chunks_filename)
91
- if os.path.exists(chunks_csv_path):
92
- chunks_df = pd.read_csv(chunks_csv_path)
93
- if not chunks_df.empty and 'document_name' in chunks_df.columns:
94
- unique_docs = chunks_df['document_name'].unique()
95
- return sorted([doc for doc in unique_docs if pd.notna(doc)])
96
-
97
- # Fallback to checking uploaded files directory
98
- upload_dir = "UPLOADED_DOCUMENTS"
99
- if os.path.exists(upload_dir):
100
- documents = []
101
- for file_name in os.listdir(upload_dir):
102
- if file_name.endswith(('.txt', '.pdf')):
103
- doc_name = os.path.splitext(file_name)[0]
104
- documents.append(doc_name)
105
- return sorted(documents)
106
-
107
- return []
108
- except Exception as e:
109
- log_message(f"❌ Error reading documents: {str(e)}")
110
- return []
111
-
112
- def add_to_vector_index(new_chunks, file_info, existing_chunks_df=None):
113
- try:
114
- log_message("🔧 Setting up embedding model...")
115
- embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
116
-
117
- log_message("📝 Creating document objects...")
118
- new_documents = []
119
- new_chunk_data = []
120
-
121
- for i, chunk in enumerate(new_chunks):
122
- doc_id = f"{file_info['file_name']}_{i}"
123
- new_documents.append(Document(
124
- text=chunk,
125
- metadata={
126
- "chunk_id": doc_id,
127
- "document_id": file_info['file_name'],
128
- "document_name": file_info['document'],
129
- "document_link": file_info['doc_link']
130
- }
131
- ))
132
- new_chunk_data.append({
133
- 'chunk_id': doc_id,
134
- 'document_id': file_info['file_name'],
135
- 'document_name': file_info['document'],
136
- 'document_link': file_info['doc_link'],
137
- 'chunk_text': chunk
138
- })
139
-
140
- if existing_chunks_df is not None:
141
- log_message("🔄 Merging with existing chunks...")
142
- new_chunks_df = pd.DataFrame(new_chunk_data)
143
- chunks_df = pd.concat([existing_chunks_df, new_chunks_df], ignore_index=True)
144
- else:
145
- chunks_df = pd.DataFrame(new_chunk_data)
146
-
147
- log_message("🏗️ Building vector index...")
148
- all_documents = [Document(text=str(row['chunk_text']),
149
- metadata={
150
- "chunk_id": row['chunk_id'],
151
- "document_id": row['document_id'],
152
- "document_name": row['document_name'],
153
- "document_link": row['document_link']
154
- })
155
- for _, row in chunks_df.iterrows()]
156
-
157
- vector_index = VectorStoreIndex.from_documents(all_documents, embed_model=embed_model)
158
-
159
- log_message("🔍 Setting up retriever...")
160
- retriever = VectorIndexRetriever(
161
- index=vector_index,
162
- similarity_top_k=RETRIEVER_TOP_K,
163
- similarity_cutoff=SIMILARITY_THRESHOLD
164
- )
165
-
166
- log_message("🎯 Configuring response synthesizer...")
167
- custom_prompt_template = PromptTemplate(CUSTOM_PROMPT_NEW)
168
- response_synthesizer = get_response_synthesizer(
169
- response_mode=ResponseMode.TREE_SUMMARIZE,
170
- text_qa_template=custom_prompt_template
171
- )
172
-
173
- query_engine = RetrieverQueryEngine(
174
- retriever=retriever,
175
- response_synthesizer=response_synthesizer
176
- )
177
-
178
- log_message("💾 Saving chunks to file...")
179
- os.makedirs(download_dir, exist_ok=True)
180
- chunks_df.to_csv(os.path.join(download_dir, chunks_filename), index=False)
181
-
182
- log_message("✅ Successfully added document to vector index")
183
- return query_engine, chunks_df, None
184
-
185
- except Exception as e:
186
- log_message(f"❌ Error adding to vector index: {str(e)}")
187
- return None, existing_chunks_df, str(e)
188
-
189
- def initialize_system():
190
- global query_engine, chunks_df
191
-
192
- try:
193
- log_message("🔄 Initializing system...")
194
- os.makedirs(download_dir, exist_ok=True)
195
-
196
- log_message("📥 Loading files...")
197
- faiss_index_path = hf_hub_download(
198
- repo_id=REPO_ID,
199
- filename=faiss_index_filename,
200
- local_dir=download_dir,
201
- repo_type="dataset",
202
- token=HF_TOKEN
203
- )
204
-
205
- chunks_csv_path = hf_hub_download(
206
- repo_id=REPO_ID,
207
- filename=chunks_filename,
208
- local_dir=download_dir,
209
- repo_type="dataset",
210
- token=HF_TOKEN
211
- )
212
-
213
- log_message("📚 Loading index and data...")
214
- index_faiss = faiss.read_index(faiss_index_path)
215
- chunks_df = pd.read_csv(chunks_csv_path)
216
-
217
- log_message("🤖 Setting up models...")
218
- embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
219
-
220
- text_column = None
221
- for col in chunks_df.columns:
222
- if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
223
- text_column = col
224
- break
225
-
226
- if text_column is None:
227
- text_column = chunks_df.columns[0]
228
-
229
- log_message("📝 Creating documents...")
230
- documents = [Document(text=str(row[text_column]),
231
- metadata={"chunk_id": row.get('chunk_id', i),
232
- "document_id": row.get('document_id', 'unknown'),
233
- "document_name": row.get('document_name', 'unknown'),
234
- "document_link": row.get('document_link', '')})
235
- for i, (_, row) in enumerate(chunks_df.iterrows())]
236
-
237
- log_message("🔍 Building vector index...")
238
- vector_index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
239
-
240
- retriever = VectorIndexRetriever(
241
- index=vector_index,
242
- similarity_top_k=RETRIEVER_TOP_K,
243
- similarity_cutoff=SIMILARITY_THRESHOLD
244
- )
245
-
246
- custom_prompt_template = PromptTemplate(CUSTOM_PROMPT)
247
- response_synthesizer = get_response_synthesizer(
248
- response_mode=ResponseMode.TREE_SUMMARIZE,
249
- text_qa_template=custom_prompt_template
250
- )
251
-
252
- query_engine = RetrieverQueryEngine(
253
- retriever=retriever,
254
- response_synthesizer=response_synthesizer
255
- )
256
-
257
- log_message("✅ System successfully initialized!")
258
- return query_engine, chunks_df, True
259
-
260
- except Exception as e:
261
- log_message(f"❌ Initialization error: {str(e)}")
262
- chunks_df = pd.DataFrame(columns=['chunk_id', 'document_id', 'document_name', 'document_link', 'chunk_text'])
263
- return None, chunks_df, False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
documents_prep.py CHANGED
@@ -54,38 +54,73 @@ def process_documents_with_chunking(documents):
54
 
55
  if doc_type == 'table':
56
  table_count += 1
57
- if len(doc.text) > CHUNK_SIZE:
 
58
  large_tables_count += 1
59
- log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {len(doc.text)} characters")
60
-
61
- all_chunked_docs.append(doc)
62
- chunk_info.append({
63
- 'document_id': doc.metadata.get('document_id', 'unknown'),
64
- 'section_id': doc.metadata.get('section_id', 'unknown'),
65
- 'chunk_id': 0,
66
- 'chunk_size': len(doc.text),
67
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
68
- 'type': 'table'
69
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  elif doc_type == 'image':
72
  image_count += 1
73
- if len(doc.text) > CHUNK_SIZE:
 
74
  large_images_count += 1
75
- log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {len(doc.text)} characters")
76
-
77
- all_chunked_docs.append(doc)
78
- chunk_info.append({
79
- 'document_id': doc.metadata.get('document_id', 'unknown'),
80
- 'section_id': doc.metadata.get('section_id', 'unknown'),
81
- 'chunk_id': 0,
82
- 'chunk_size': len(doc.text),
83
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
84
- 'type': 'image'
85
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
- else:
88
- if len(doc.text) > CHUNK_SIZE:
 
89
  chunked_docs = chunk_document(doc)
90
  all_chunked_docs.extend(chunked_docs)
91
  text_chunks_count += len(chunked_docs)
@@ -105,7 +140,7 @@ def process_documents_with_chunking(documents):
105
  'document_id': doc.metadata.get('document_id', 'unknown'),
106
  'section_id': doc.metadata.get('section_id', 'unknown'),
107
  'chunk_id': 0,
108
- 'chunk_size': len(doc.text),
109
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
110
  'type': 'text'
111
  })
@@ -120,6 +155,7 @@ def process_documents_with_chunking(documents):
120
 
121
  return all_chunked_docs, chunk_info
122
 
 
123
  def extract_text_from_json(data, document_id, document_name):
124
  documents = []
125
 
@@ -244,6 +280,7 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
244
 
245
  documents = extract_zip_and_process_json(local_zip_path)
246
  all_documents.extend(documents)
 
247
 
248
  except Exception as e:
249
  log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
@@ -276,17 +313,18 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
276
  log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
277
  continue
278
 
 
 
 
279
  chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
280
 
281
- log_message(f"Всего создано {len(all_documents)} исходных документов")
282
- log_message(f"После chunking получено {len(chunked_documents)} чанков")
283
 
284
  return chunked_documents, chunk_info
285
 
286
  except Exception as e:
287
  log_message(f"Ошибка загрузки JSON документов: {str(e)}")
288
  return [], []
289
-
290
 
291
  def extract_section_title(section_text):
292
  if not section_text.strip():
 
54
 
55
  if doc_type == 'table':
56
  table_count += 1
57
+ doc_size = len(doc.text)
58
+ if doc_size > CHUNK_SIZE:
59
  large_tables_count += 1
60
+ log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_size} characters")
61
+
62
+ # Chunk large tables
63
+ chunked_docs = chunk_document(doc)
64
+ all_chunked_docs.extend(chunked_docs)
65
+
66
+ for i, chunk_doc in enumerate(chunked_docs):
67
+ chunk_info.append({
68
+ 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
69
+ 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
70
+ 'chunk_id': i,
71
+ 'chunk_size': len(chunk_doc.text),
72
+ 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
73
+ 'type': 'table',
74
+ 'table_number': chunk_doc.metadata.get('table_number', 'unknown')
75
+ })
76
+ else:
77
+ all_chunked_docs.append(doc)
78
+ chunk_info.append({
79
+ 'document_id': doc.metadata.get('document_id', 'unknown'),
80
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
81
+ 'chunk_id': 0,
82
+ 'chunk_size': doc_size,
83
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
84
+ 'type': 'table',
85
+ 'table_number': doc.metadata.get('table_number', 'unknown')
86
+ })
87
 
88
  elif doc_type == 'image':
89
  image_count += 1
90
+ doc_size = len(doc.text)
91
+ if doc_size > CHUNK_SIZE:
92
  large_images_count += 1
93
+ log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_size} characters")
94
+
95
+ # Chunk large images
96
+ chunked_docs = chunk_document(doc)
97
+ all_chunked_docs.extend(chunked_docs)
98
+
99
+ for i, chunk_doc in enumerate(chunked_docs):
100
+ chunk_info.append({
101
+ 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
102
+ 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
103
+ 'chunk_id': i,
104
+ 'chunk_size': len(chunk_doc.text),
105
+ 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
106
+ 'type': 'image',
107
+ 'image_number': chunk_doc.metadata.get('image_number', 'unknown')
108
+ })
109
+ else:
110
+ all_chunked_docs.append(doc)
111
+ chunk_info.append({
112
+ 'document_id': doc.metadata.get('document_id', 'unknown'),
113
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
114
+ 'chunk_id': 0,
115
+ 'chunk_size': doc_size,
116
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
117
+ 'type': 'image',
118
+ 'image_number': doc.metadata.get('image_number', 'unknown')
119
+ })
120
 
121
+ else: # text documents
122
+ doc_size = len(doc.text)
123
+ if doc_size > CHUNK_SIZE:
124
  chunked_docs = chunk_document(doc)
125
  all_chunked_docs.extend(chunked_docs)
126
  text_chunks_count += len(chunked_docs)
 
140
  'document_id': doc.metadata.get('document_id', 'unknown'),
141
  'section_id': doc.metadata.get('section_id', 'unknown'),
142
  'chunk_id': 0,
143
+ 'chunk_size': doc_size,
144
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
145
  'type': 'text'
146
  })
 
155
 
156
  return all_chunked_docs, chunk_info
157
 
158
+
159
  def extract_text_from_json(data, document_id, document_name):
160
  documents = []
161
 
 
280
 
281
  documents = extract_zip_and_process_json(local_zip_path)
282
  all_documents.extend(documents)
283
+ log_message(f"Извлечено {len(documents)} документов из ZIP архива {zip_file_path}")
284
 
285
  except Exception as e:
286
  log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
 
313
  log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
314
  continue
315
 
316
+ log_message(f"Всего создано {len(all_documents)} исходных документов из JSON файлов")
317
+
318
+ # Process documents through chunking function
319
  chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
320
 
321
+ log_message(f"После chunking получено {len(chunked_documents)} чанков из JSON данных")
 
322
 
323
  return chunked_documents, chunk_info
324
 
325
  except Exception as e:
326
  log_message(f"Ошибка загрузки JSON документов: {str(e)}")
327
  return [], []
 
328
 
329
  def extract_section_title(section_text):
330
  if not section_text.strip():