Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Community

MrSimple07 committed on Sep 18, 2025

Commit

3f09b3e

1 Parent(s): 499b5c3

process_documents_with_chunking improvement

Browse files

Files changed (3) hide show

app.py +15 -5
document_processor.py +0 -263
documents_prep.py +68 -30

app.py CHANGED Viewed

@@ -96,6 +96,7 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
                      use_json_instead_csv=False):
     try:
         log_message("Инициализация системы")
         os.makedirs(download_dir, exist_ok=True)
         from config import CHUNK_SIZE, CHUNK_OVERLAP
@@ -112,10 +113,9 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
             chunk_overlap=CHUNK_OVERLAP,
             separator=" "
         )
-        # Add this after setting Settings
         log_message(f"Configured chunk size: {CHUNK_SIZE}")
         log_message(f"Configured chunk overlap: {CHUNK_OVERLAP}")
-        log_message(f"Settings text splitter chunk size: {Settings.text_splitter.chunk_size if hasattr(Settings, 'text_splitter') else 'Not set'}")
         all_documents = []
         chunks_df = None
@@ -135,14 +135,24 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
         if table_data_dir:
             log_message("Добавляю табличные данные")
             table_documents = load_table_data(repo_id, hf_token, table_data_dir)
-            all_documents.extend(table_documents)
         if image_data_dir:
             log_message("Добавляю данные изображений")
             image_documents = load_image_data(repo_id, hf_token, image_data_dir)
-            all_documents.extend(image_documents)
-        log_message(f"Всего документов: {len(all_documents)}")
         vector_index = create_vector_index(all_documents)
         query_engine = create_query_engine(vector_index)

                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
                      use_json_instead_csv=False):
     try:
+        from documents_prep import process_documents_with_chunking
         log_message("Инициализация системы")
         os.makedirs(download_dir, exist_ok=True)
         from config import CHUNK_SIZE, CHUNK_OVERLAP
             chunk_overlap=CHUNK_OVERLAP,
             separator=" "
         )
         log_message(f"Configured chunk size: {CHUNK_SIZE}")
         log_message(f"Configured chunk overlap: {CHUNK_OVERLAP}")
         all_documents = []
         chunks_df = None
         if table_data_dir:
             log_message("Добавляю табличные данные")
             table_documents = load_table_data(repo_id, hf_token, table_data_dir)
+            log_message(f"Загружено {len(table_documents)} табличных документов")
+            # Process table documents through chunking
+            chunked_table_docs, table_chunk_info = process_documents_with_chunking(table_documents)
+            all_documents.extend(chunked_table_docs)
+            chunk_info.extend(table_chunk_info)
         if image_data_dir:
             log_message("Добавляю данные изображений")
             image_documents = load_image_data(repo_id, hf_token, image_data_dir)
+            log_message(f"Загружено {len(image_documents)} документов изображений")
+            # Process image documents through chunking
+            chunked_image_docs, image_chunk_info = process_documents_with_chunking(image_documents)
+            all_documents.extend(chunked_image_docs)
+            chunk_info.extend(image_chunk_info)
+        log_message(f"Всего документов после всей обработки: {len(all_documents)}")
         vector_index = create_vector_index(all_documents)
         query_engine = create_query_engine(vector_index)

document_processor.py DELETED Viewed

@@ -1,263 +0,0 @@
-import os
-import fitz
-import pandas as pd
-from pathlib import Path
-from llama_index.core import Document, VectorStoreIndex
-from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-from llama_index.core.query_engine import RetrieverQueryEngine
-from llama_index.core.retrievers import VectorIndexRetriever
-from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
-from llama_index.core.prompts import PromptTemplate
-from config import *
-import shutil
-import faiss
-from huggingface_hub import hf_hub_download
-def log_message(message):
-    print(message, flush=True)
-def extract_text_from_pdf(file_path):
-    doc = fitz.open(file_path)
-    text = ""
-    for page in doc:
-        text += page.get_text()
-    doc.close()
-    return text
-def extract_text_from_txt(file_path):
-    with open(file_path, 'r', encoding='utf-8') as file:
-        return file.read()
-def chunk_text(text, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
-    log_message(f"📄 Chunking text into pieces of {chunk_size} characters...")
-    chunks = []
-    start = 0
-    while start < len(text):
-        end = start + chunk_size
-        chunk = text[start:end]
-        chunks.append(chunk)
-        start = end - chunk_overlap
-    log_message(f"✅ Created {len(chunks)} chunks")
-    return chunks
-def process_uploaded_file(file_path, file_name, doc_name, doc_link):
-    try:
-        log_message(f"🔄 Processing file: {file_name}")
-        # Create upload directory if it doesn't exist
-        upload_dir = "UPLOADED_DOCUMENTS"
-        os.makedirs(upload_dir, exist_ok=True)
-        # Copy uploaded file to permanent location
-        permanent_file_path = os.path.join(upload_dir, file_name)
-        if os.path.abspath(file_path) != os.path.abspath(permanent_file_path):
-            shutil.copy2(file_path, permanent_file_path)
-            log_message(f"📁 File saved to: {permanent_file_path}")
-        file_extension = Path(file_path).suffix.lower()
-        if file_extension == '.pdf':
-            log_message("📖 Extracting text from PDF...")
-            text = extract_text_from_pdf(file_path)
-        elif file_extension == '.txt':
-            log_message("📝 Reading text file...")
-            text = extract_text_from_txt(file_path)
-        else:
-            return None, "Unsupported file type"
-        word_count = len(text.split())
-        log_message(f"📊 Extracted {word_count} words from document")
-        chunks = chunk_text(text)
-        return {
-            'document': doc_name,
-            'file_name': file_name,
-            'doc_link': doc_link,
-            'total_words': word_count,
-            'extracted_text': text,
-            'chunks': chunks
-        }, None
-    except Exception as e:
-        log_message(f"❌ Error processing file: {str(e)}")
-        return None, str(e)
-def get_existing_documents():
-    try:
-        # First check CSV file for processed documents
-        chunks_csv_path = os.path.join(download_dir, chunks_filename)
-        if os.path.exists(chunks_csv_path):
-            chunks_df = pd.read_csv(chunks_csv_path)
-            if not chunks_df.empty and 'document_name' in chunks_df.columns:
-                unique_docs = chunks_df['document_name'].unique()
-                return sorted([doc for doc in unique_docs if pd.notna(doc)])
-        # Fallback to checking uploaded files directory
-        upload_dir = "UPLOADED_DOCUMENTS"
-        if os.path.exists(upload_dir):
-            documents = []
-            for file_name in os.listdir(upload_dir):
-                if file_name.endswith(('.txt', '.pdf')):
-                    doc_name = os.path.splitext(file_name)[0]
-                    documents.append(doc_name)
-            return sorted(documents)
-        return []
-    except Exception as e:
-        log_message(f"❌ Error reading documents: {str(e)}")
-        return []
-def add_to_vector_index(new_chunks, file_info, existing_chunks_df=None):
-    try:
-        log_message("🔧 Setting up embedding model...")
-        embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
-        log_message("📝 Creating document objects...")
-        new_documents = []
-        new_chunk_data = []
-        for i, chunk in enumerate(new_chunks):
-            doc_id = f"{file_info['file_name']}_{i}"
-            new_documents.append(Document(
-                text=chunk,
-                metadata={
-                    "chunk_id": doc_id,
-                    "document_id": file_info['file_name'],
-                    "document_name": file_info['document'],
-                    "document_link": file_info['doc_link']
-                }
-            ))
-            new_chunk_data.append({
-                'chunk_id': doc_id,
-                'document_id': file_info['file_name'],
-                'document_name': file_info['document'],
-                'document_link': file_info['doc_link'],
-                'chunk_text': chunk
-            })
-        if existing_chunks_df is not None:
-            log_message("🔄 Merging with existing chunks...")
-            new_chunks_df = pd.DataFrame(new_chunk_data)
-            chunks_df = pd.concat([existing_chunks_df, new_chunks_df], ignore_index=True)
-        else:
-            chunks_df = pd.DataFrame(new_chunk_data)
-        log_message("🏗️ Building vector index...")
-        all_documents = [Document(text=str(row['chunk_text']),
-                                metadata={
-                                    "chunk_id": row['chunk_id'],
-                                    "document_id": row['document_id'],
-                                    "document_name": row['document_name'],
-                                    "document_link": row['document_link']
-                                })
-                        for _, row in chunks_df.iterrows()]
-        vector_index = VectorStoreIndex.from_documents(all_documents, embed_model=embed_model)
-        log_message("🔍 Setting up retriever...")
-        retriever = VectorIndexRetriever(
-            index=vector_index,
-            similarity_top_k=RETRIEVER_TOP_K,
-            similarity_cutoff=SIMILARITY_THRESHOLD
-        )
-        log_message("🎯 Configuring response synthesizer...")
-        custom_prompt_template = PromptTemplate(CUSTOM_PROMPT_NEW)
-        response_synthesizer = get_response_synthesizer(
-            response_mode=ResponseMode.TREE_SUMMARIZE,
-            text_qa_template=custom_prompt_template
-        )
-        query_engine = RetrieverQueryEngine(
-            retriever=retriever,
-            response_synthesizer=response_synthesizer
-        )
-        log_message("💾 Saving chunks to file...")
-        os.makedirs(download_dir, exist_ok=True)
-        chunks_df.to_csv(os.path.join(download_dir, chunks_filename), index=False)
-        log_message("✅ Successfully added document to vector index")
-        return query_engine, chunks_df, None
-    except Exception as e:
-        log_message(f"❌ Error adding to vector index: {str(e)}")
-        return None, existing_chunks_df, str(e)
-def initialize_system():
-    global query_engine, chunks_df
-    try:
-        log_message("🔄 Initializing system...")
-        os.makedirs(download_dir, exist_ok=True)
-        log_message("📥 Loading files...")
-        faiss_index_path = hf_hub_download(
-            repo_id=REPO_ID,
-            filename=faiss_index_filename,
-            local_dir=download_dir,
-            repo_type="dataset",
-            token=HF_TOKEN
-        )
-        chunks_csv_path = hf_hub_download(
-            repo_id=REPO_ID,
-            filename=chunks_filename,
-            local_dir=download_dir,
-            repo_type="dataset",
-            token=HF_TOKEN
-        )
-        log_message("📚 Loading index and data...")
-        index_faiss = faiss.read_index(faiss_index_path)
-        chunks_df = pd.read_csv(chunks_csv_path)
-        log_message("🤖 Setting up models...")
-        embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
-        text_column = None
-        for col in chunks_df.columns:
-            if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
-                text_column = col
-                break
-        if text_column is None:
-            text_column = chunks_df.columns[0]
-        log_message("📝 Creating documents...")
-        documents = [Document(text=str(row[text_column]),
-                           metadata={"chunk_id": row.get('chunk_id', i),
-                                   "document_id": row.get('document_id', 'unknown'),
-                                   "document_name": row.get('document_name', 'unknown'),
-                                   "document_link": row.get('document_link', '')})
-                    for i, (_, row) in enumerate(chunks_df.iterrows())]
-        log_message("🔍 Building vector index...")
-        vector_index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
-        retriever = VectorIndexRetriever(
-            index=vector_index,
-            similarity_top_k=RETRIEVER_TOP_K,
-            similarity_cutoff=SIMILARITY_THRESHOLD
-        )
-        custom_prompt_template = PromptTemplate(CUSTOM_PROMPT)
-        response_synthesizer = get_response_synthesizer(
-            response_mode=ResponseMode.TREE_SUMMARIZE,
-            text_qa_template=custom_prompt_template
-        )
-        query_engine = RetrieverQueryEngine(
-            retriever=retriever,
-            response_synthesizer=response_synthesizer
-        )
-        log_message("✅ System successfully initialized!")
-        return query_engine, chunks_df, True
-    except Exception as e:
-        log_message(f"❌ Initialization error: {str(e)}")
-        chunks_df = pd.DataFrame(columns=['chunk_id', 'document_id', 'document_name', 'document_link', 'chunk_text'])
-        return None, chunks_df, False

documents_prep.py CHANGED Viewed

@@ -54,38 +54,73 @@ def process_documents_with_chunking(documents):
         if doc_type == 'table':
             table_count += 1
-            if len(doc.text) > CHUNK_SIZE:
                 large_tables_count += 1
-                log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {len(doc.text)} characters")
-            all_chunked_docs.append(doc)
-            chunk_info.append({
-                'document_id': doc.metadata.get('document_id', 'unknown'),
-                'section_id': doc.metadata.get('section_id', 'unknown'),
-                'chunk_id': 0,
-                'chunk_size': len(doc.text),
-                'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
-                'type': 'table'
-            })
         elif doc_type == 'image':
             image_count += 1
-            if len(doc.text) > CHUNK_SIZE:
                 large_images_count += 1
-                log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {len(doc.text)} characters")
-            all_chunked_docs.append(doc)
-            chunk_info.append({
-                'document_id': doc.metadata.get('document_id', 'unknown'),
-                'section_id': doc.metadata.get('section_id', 'unknown'),
-                'chunk_id': 0,
-                'chunk_size': len(doc.text),
-                'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
-                'type': 'image'
-            })
-        else:
-            if len(doc.text) > CHUNK_SIZE:
                 chunked_docs = chunk_document(doc)
                 all_chunked_docs.extend(chunked_docs)
                 text_chunks_count += len(chunked_docs)
@@ -105,7 +140,7 @@ def process_documents_with_chunking(documents):
                     'document_id': doc.metadata.get('document_id', 'unknown'),
                     'section_id': doc.metadata.get('section_id', 'unknown'),
                     'chunk_id': 0,
-                    'chunk_size': len(doc.text),
                     'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
                     'type': 'text'
                 })
@@ -120,6 +155,7 @@ def process_documents_with_chunking(documents):
     return all_chunked_docs, chunk_info
 def extract_text_from_json(data, document_id, document_name):
     documents = []
@@ -244,6 +280,7 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
                 documents = extract_zip_and_process_json(local_zip_path)
                 all_documents.extend(documents)
             except Exception as e:
                 log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
@@ -276,17 +313,18 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
                 log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                 continue
         chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
-        log_message(f"Всего создано {len(all_documents)} исходных документов")
-        log_message(f"После chunking получено {len(chunked_documents)} чанков")
         return chunked_documents, chunk_info
     except Exception as e:
         log_message(f"Ошибка загрузки JSON документов: {str(e)}")
         return [], []
 def extract_section_title(section_text):
     if not section_text.strip():

         if doc_type == 'table':
             table_count += 1
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
                 large_tables_count += 1
+                log_message(f"Large table found: {doc.metadata.get('table_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_size} characters")
+                # Chunk large tables
+                chunked_docs = chunk_document(doc)
+                all_chunked_docs.extend(chunked_docs)
+                for i, chunk_doc in enumerate(chunked_docs):
+                    chunk_info.append({
+                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
+                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
+                        'chunk_id': i,
+                        'chunk_size': len(chunk_doc.text),
+                        'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
+                        'type': 'table',
+                        'table_number': chunk_doc.metadata.get('table_number', 'unknown')
+                    })
+            else:
+                all_chunked_docs.append(doc)
+                chunk_info.append({
+                    'document_id': doc.metadata.get('document_id', 'unknown'),
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': 0,
+                    'chunk_size': doc_size,
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'table',
+                    'table_number': doc.metadata.get('table_number', 'unknown')
+                })
         elif doc_type == 'image':
             image_count += 1
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
                 large_images_count += 1
+                log_message(f"Large image description found: {doc.metadata.get('image_number', 'unknown')} in document {doc.metadata.get('document_id', 'unknown')}, size: {doc_size} characters")
+                # Chunk large image descriptions
+                chunked_docs = chunk_document(doc)
+                all_chunked_docs.extend(chunked_docs)
+                for i, chunk_doc in enumerate(chunked_docs):
+                    chunk_info.append({
+                        'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
+                        'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
+                        'chunk_id': i,
+                        'chunk_size': len(chunk_doc.text),
+                        'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
+                        'type': 'image',
+                        'image_number': chunk_doc.metadata.get('image_number', 'unknown')
+                    })
+            else:
+                all_chunked_docs.append(doc)
+                chunk_info.append({
+                    'document_id': doc.metadata.get('document_id', 'unknown'),
+                    'section_id': doc.metadata.get('section_id', 'unknown'),
+                    'chunk_id': 0,
+                    'chunk_size': doc_size,
+                    'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
+                    'type': 'image',
+                    'image_number': doc.metadata.get('image_number', 'unknown')
+                })
+        else:  # text documents
+            doc_size = len(doc.text)
+            if doc_size > CHUNK_SIZE:
                 chunked_docs = chunk_document(doc)
                 all_chunked_docs.extend(chunked_docs)
                 text_chunks_count += len(chunked_docs)
                     'document_id': doc.metadata.get('document_id', 'unknown'),
                     'section_id': doc.metadata.get('section_id', 'unknown'),
                     'chunk_id': 0,
+                    'chunk_size': doc_size,
                     'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
                     'type': 'text'
                 })
     return all_chunked_docs, chunk_info
 def extract_text_from_json(data, document_id, document_name):
     documents = []
                 documents = extract_zip_and_process_json(local_zip_path)
                 all_documents.extend(documents)
+                log_message(f"Извлечено {len(documents)} документов из ZIP архива {zip_file_path}")
             except Exception as e:
                 log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
                 log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
                 continue
+        log_message(f"Всего создано {len(all_documents)} исходных документов из JSON файлов")
+        # Process documents through chunking function
         chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
+        log_message(f"После chunking получено {len(chunked_documents)} чанков из JSON данных")
         return chunked_documents, chunk_info
     except Exception as e:
         log_message(f"Ошибка загрузки JSON документов: {str(e)}")
         return [], []
 def extract_section_title(section_text):
     if not section_text.strip():