Upload 2 files
- chunker.py +13 -57
- rag_components.py +7 -4
chunker.py
CHANGED
@@ -4,8 +4,10 @@ import json
 import argparse
 from typing import List, Dict, Optional
 
-
-
+# --- UPDATED IMPORT ---
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+# ----------------------
+
 from utils import extract_text_from_file, FAISS_RAG_SUPPORTED_EXTENSIONS
 
 # --- Logging Setup ---
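
The old import lines are replaced by the standalone langchain-text-splitters package. The splitter is constructed elsewhere in the script (outside these hunks), so the following is only a sketch of how the new import is used, assuming the constructor arguments mirror the function defaults of 1000 and 150:

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Sketch only: chunk_size / chunk_overlap are assumed from the function
# defaults below; the actual construction site is not shown in this diff.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
chunks = text_splitter.split_text("long extracted document text ...")
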
@@ -18,10 +20,6 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
-# Note: The 'extract_text_from_file' and 'SUPPORTED_EXTENSIONS' dictionary
-# have been removed from this file and are now imported from 'utils.py'
-# to ensure a single source of truth for file processing logic.
-
 def process_sources_and_create_chunks(
     sources_dir: str,
     output_file: str,
@@ -29,11 +27,6 @@ def process_sources_and_create_chunks(
     chunk_overlap: int = 150,
     text_output_dir: Optional[str] = None
 ) -> None:
-    """
-    Scans a directory for source files, extracts text, splits it into chunks,
-    and saves the chunks to a single JSON file.
-    Optionally saves the raw extracted text to a specified directory.
-    """
    if not os.path.isdir(sources_dir):
        logger.error(f"Source directory not found: '{sources_dir}'")
        raise FileNotFoundError(f"Source directory not found: '{sources_dir}'")
@@ -60,7 +53,6 @@ def process_sources_and_create_chunks(
             continue
 
         logger.info(f"Processing source file: {filename}")
-        # MODIFIED: Use the imported function
         text_content = FAISS_RAG_SUPPORTED_EXTENSIONS[file_ext](file_path)
 
         if text_content:
@@ -69,15 +61,10 @@
                     text_output_path = os.path.join(text_output_dir, f"{filename}.txt")
                     with open(text_output_path, 'w', encoding='utf-8') as f_text:
                         f_text.write(text_content)
-                    logger.info(f"Saved extracted text for '{filename}' to '{text_output_path}'")
                 except Exception as e_text_save:
                     logger.error(f"Could not save extracted text for '{filename}': {e_text_save}")
 
             chunks = text_splitter.split_text(text_content)
-            if not chunks:
-                logger.warning(f"No chunks generated from {filename}. Skipping.")
-                continue
-
             for i, chunk_text in enumerate(chunks):
                 chunk_data = {
                     "page_content": chunk_text,
@@ -90,11 +77,9 @@
                 all_chunks_for_json.append(chunk_data)
 
             processed_files_count += 1
-        else:
-            logger.warning(f"Could not extract text from {filename}. Skipping.")
 
     if not all_chunks_for_json:
-        logger.warning(f"No processable documents found
+        logger.warning(f"No processable documents found in '{sources_dir}'.")
 
     output_dir = os.path.dirname(output_file)
     os.makedirs(output_dir, exist_ok=True)
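
Each chunk is written to the JSON output as a dict whose "page_content" key holds the chunk text; the remaining keys of chunk_data are cut off in this hunk. A small sketch of reading that file back into LangChain Document objects; the "metadata" key is an assumption, since only "page_content" is visible here:

import json
from typing import List

from langchain_core.documents import Document

def load_chunks(chunks_file: str) -> List[Document]:
    # Rebuild Document objects from the chunker's JSON output.
    with open(chunks_file, 'r', encoding='utf-8') as f:
        records = json.load(f)
    # "metadata" is assumed; only "page_content" appears in this diff.
    return [Document(page_content=r["page_content"], metadata=r.get("metadata", {}))
            for r in records]
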
@@ -102,44 +87,15 @@
     with open(output_file, 'w', encoding='utf-8') as f:
         json.dump(all_chunks_for_json, f, indent=2)
 
-    logger.info(f"Chunking complete. Processed {processed_files_count} files.")
-    logger.info(f"Created a total of {len(all_chunks_for_json)} chunks.")
-    logger.info(f"Chunked JSON output saved to: {output_file}")
-
+    logger.info(f"Chunking complete. Processed {processed_files_count} files. Total chunks: {len(all_chunks_for_json)}")
 
 def main():
-    parser = argparse.ArgumentParser(
-    parser.add_argument(
-
-
-
-
-    )
-    parser.add_argument(
-        '--output-file',
-        type=str,
-        required=True,
-        help="The full path for the output JSON file containing the chunks."
-    )
-    parser.add_argument(
-        '--text-output-dir',
-        type=str,
-        default=None,
-        help="Optional: The directory to save raw extracted text files for debugging."
-    )
-    parser.add_argument(
-        '--chunk-size',
-        type=int,
-        default=1000,
-        help="The character size for each text chunk."
-    )
-    parser.add_argument(
-        '--chunk-overlap',
-        type=int,
-        default=150,
-        help="The character overlap between consecutive chunks."
-    )
-
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--sources-dir', type=str, required=True)
+    parser.add_argument('--output-file', type=str, required=True)
+    parser.add_argument('--text-output-dir', type=str, default=None)
+    parser.add_argument('--chunk-size', type=int, default=1000)
+    parser.add_argument('--chunk-overlap', type=int, default=150)
     args = parser.parse_args()
 
     try:
@@ -151,7 +107,7 @@ def main():
             text_output_dir=args.text_output_dir
         )
     except Exception as e:
-        logger.critical(f"
+        logger.critical(f"Chunking failed: {e}", exc_info=True)
         exit(1)
 
 if __name__ == "__main__":
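
The rewritten main() keeps the same five flags, only the help strings are gone. For reference, the equivalent programmatic call looks roughly like the snippet below; the paths are placeholders, and the chunk_size keyword is inferred from the --chunk-size flag because that parameter line falls outside the hunks above:

from chunker import process_sources_and_create_chunks

process_sources_and_create_chunks(
    sources_dir="sources/",           # --sources-dir (placeholder path)
    output_file="data/chunks.json",   # --output-file (placeholder path)
    chunk_size=1000,                  # --chunk-size (default)
    chunk_overlap=150,                # --chunk-overlap (default)
    text_output_dir=None,             # --text-output-dir (optional)
)
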
rag_components.py
CHANGED
@@ -9,9 +9,13 @@ from sentence_transformers import CrossEncoder
 
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
-
-
-from
+
+# --- UPDATED IMPORTS FOR NEW LANGCHAIN ---
+from langchain_core.documents import Document
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.callbacks import CallbackManagerForRetrieverRun
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+# -----------------------------------------
 
 from config import (
     RAG_RERANKER_MODEL_NAME, RAG_DETAILED_LOGGING,
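
BaseRetriever and CallbackManagerForRetrieverRun are normally imported only to define a custom retriever. The actual class in rag_components.py is not part of this diff, so the snippet below is only an illustrative sketch of the pattern these new import paths support; the class name, fields, and the similarity_search call are assumptions:

from typing import Any, List

from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever

class SimpleFAISSRetriever(BaseRetriever):
    # Illustrative only; not the class defined in rag_components.py.
    vectorstore: Any  # a FAISS vector store, typed Any to keep pydantic happy
    k: int = 5

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        # Delegate to the underlying FAISS similarity search.
        return self.vectorstore.similarity_search(query, k=self.k)

Such a retriever then plugs into the rest of the pipeline like any other runnable, e.g. retriever.invoke("some query").
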
@@ -227,7 +231,6 @@ class KnowledgeRAG:
 
         self.logger.info("[INDEX_LOAD] Success.")
 
-    # --- RESTORED: Incremental Index Update ---
     def update_index_with_new_files(self, source_folder_path: str, max_files_to_process: Optional[int] = None) -> Dict[str, Any]:
         self.logger.info(f"[INDEX_UPDATE] Checking for new files in: {source_folder_path}")
 
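
Only the marker comment above update_index_with_new_files is removed; the method body is unchanged and not shown. As a rough sketch of the incremental-update pattern the method name and the imports above suggest (the splitter settings, the metadata, and the add_documents call are assumptions, not code from this file):

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

def add_new_text_to_index(vectorstore, text: str, source_name: str) -> int:
    # Illustrative only: split newly discovered text and append it to an
    # existing FAISS index. The real method also tracks already-indexed
    # files and persists the index, which is outside this diff.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = [Document(page_content=chunk, metadata={"source": source_name})
            for chunk in splitter.split_text(text)]
    if docs:
        vectorstore.add_documents(docs)  # appends to the index in place
    return len(docs)
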