SakibAhmed committed (verified)
Commit: ffa9c75 · Parent(s): 4a461a4

Upload 2 files

Files changed (2):
  1. chunker.py +13 -57
  2. rag_components.py +7 -4
chunker.py CHANGED
@@ -4,8 +4,10 @@ import json
 import argparse
 from typing import List, Dict, Optional
 
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-# MODIFIED: Import the text extraction utility to avoid code duplication
+# --- UPDATED IMPORT ---
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+# ----------------------
+
 from utils import extract_text_from_file, FAISS_RAG_SUPPORTED_EXTENSIONS
 
 # --- Logging Setup ---
@@ -18,10 +20,6 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
-# Note: The 'extract_text_from_file' and 'SUPPORTED_EXTENSIONS' dictionary
-# have been removed from this file and are now imported from 'utils.py'
-# to ensure a single source of truth for file processing logic.
-
 def process_sources_and_create_chunks(
     sources_dir: str,
     output_file: str,
@@ -29,11 +27,6 @@ def process_sources_and_create_chunks(
     chunk_overlap: int = 150,
     text_output_dir: Optional[str] = None
 ) -> None:
-    """
-    Scans a directory for source files, extracts text, splits it into chunks,
-    and saves the chunks to a single JSON file.
-    Optionally saves the raw extracted text to a specified directory.
-    """
     if not os.path.isdir(sources_dir):
         logger.error(f"Source directory not found: '{sources_dir}'")
         raise FileNotFoundError(f"Source directory not found: '{sources_dir}'")
@@ -60,7 +53,6 @@ def process_sources_and_create_chunks(
             continue
 
         logger.info(f"Processing source file: {filename}")
-        # MODIFIED: Use the imported function
         text_content = FAISS_RAG_SUPPORTED_EXTENSIONS[file_ext](file_path)
 
         if text_content:
@@ -69,15 +61,10 @@ def process_sources_and_create_chunks(
                     text_output_path = os.path.join(text_output_dir, f"{filename}.txt")
                     with open(text_output_path, 'w', encoding='utf-8') as f_text:
                         f_text.write(text_content)
-                    logger.info(f"Saved extracted text for '{filename}' to '{text_output_path}'")
                 except Exception as e_text_save:
                     logger.error(f"Could not save extracted text for '{filename}': {e_text_save}")
 
             chunks = text_splitter.split_text(text_content)
-            if not chunks:
-                logger.warning(f"No chunks generated from {filename}. Skipping.")
-                continue
-
             for i, chunk_text in enumerate(chunks):
                 chunk_data = {
                     "page_content": chunk_text,
@@ -90,11 +77,9 @@ def process_sources_and_create_chunks(
                 all_chunks_for_json.append(chunk_data)
 
             processed_files_count += 1
-        else:
-            logger.warning(f"Could not extract text from {filename}. Skipping.")
 
     if not all_chunks_for_json:
-        logger.warning(f"No processable documents found or no text extracted in '{sources_dir}'. JSON file will be empty.")
+        logger.warning(f"No processable documents found in '{sources_dir}'.")
 
     output_dir = os.path.dirname(output_file)
     os.makedirs(output_dir, exist_ok=True)
@@ -102,44 +87,15 @@ def process_sources_and_create_chunks(
     with open(output_file, 'w', encoding='utf-8') as f:
         json.dump(all_chunks_for_json, f, indent=2)
 
-    logger.info(f"Chunking complete. Processed {processed_files_count} files.")
-    logger.info(f"Created a total of {len(all_chunks_for_json)} chunks.")
-    logger.info(f"Chunked JSON output saved to: {output_file}")
-
+    logger.info(f"Chunking complete. Processed {processed_files_count} files. Total chunks: {len(all_chunks_for_json)}")
 
 def main():
-    parser = argparse.ArgumentParser(description="Process source documents into a JSON file of text chunks for RAG.")
-    parser.add_argument(
-        '--sources-dir',
-        type=str,
-        required=True,
-        help="The directory containing source files (PDFs, DOCX, TXT)."
-    )
-    parser.add_argument(
-        '--output-file',
-        type=str,
-        required=True,
-        help="The full path for the output JSON file containing the chunks."
-    )
-    parser.add_argument(
-        '--text-output-dir',
-        type=str,
-        default=None,
-        help="Optional: The directory to save raw extracted text files for debugging."
-    )
-    parser.add_argument(
-        '--chunk-size',
-        type=int,
-        default=1000,
-        help="The character size for each text chunk."
-    )
-    parser.add_argument(
-        '--chunk-overlap',
-        type=int,
-        default=150,
-        help="The character overlap between consecutive chunks."
-    )
-
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--sources-dir', type=str, required=True)
+    parser.add_argument('--output-file', type=str, required=True)
+    parser.add_argument('--text-output-dir', type=str, default=None)
+    parser.add_argument('--chunk-size', type=int, default=1000)
+    parser.add_argument('--chunk-overlap', type=int, default=150)
     args = parser.parse_args()
 
     try:
@@ -151,7 +107,7 @@ def main():
             text_output_dir=args.text_output_dir
        )
    except Exception as e:
-        logger.critical(f"A critical error occurred during the chunking process: {e}", exc_info=True)
+        logger.critical(f"Chunking failed: {e}", exc_info=True)
         exit(1)
 
 if __name__ == "__main__":
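
The substantive change to chunker.py is the splitter import: langchain.text_splitter is a legacy path, and the splitters now ship in the standalone langchain-text-splitters package, which is what this commit adopts. Below is a minimal sketch of the updated import in use, assuming the package is installed (pip install langchain-text-splitters) and using this script's default settings; the sample text is hypothetical:

# sketch: exercise the new splitter import with the script's defaults
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
chunks = splitter.split_text("some long document text ... " * 200)
print(f"{len(chunks)} chunks; first chunk is {len(chunks[0])} characters")

The trimmed argparse block keeps the same five flags, so an invocation along these lines should still work (paths are illustrative):

python chunker.py --sources-dir ./sources --output-file ./data/chunks.json --text-output-dir ./extracted_text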
rag_components.py CHANGED
@@ -9,9 +9,13 @@ from sentence_transformers import CrossEncoder
 
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
-from langchain.schema import Document, BaseRetriever
-from langchain.callbacks.manager import CallbackManagerForRetrieverRun
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+# --- UPDATED IMPORTS FOR NEW LANGCHAIN ---
+from langchain_core.documents import Document
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.callbacks import CallbackManagerForRetrieverRun
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+# -----------------------------------------
 
 from config import (
     RAG_RERANKER_MODEL_NAME, RAG_DETAILED_LOGGING,
@@ -227,7 +231,6 @@ class KnowledgeRAG:
 
         self.logger.info("[INDEX_LOAD] Success.")
 
-    # --- RESTORED: Incremental Index Update ---
     def update_index_with_new_files(self, source_folder_path: str, max_files_to_process: Optional[int] = None) -> Dict[str, Any]:
         self.logger.info(f"[INDEX_UPDATE] Checking for new files in: {source_folder_path}")
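
The rag_components.py changes follow the same package split: Document, BaseRetriever, and CallbackManagerForRetrieverRun now live in langchain_core, replacing the legacy langchain.schema and langchain.callbacks.manager paths. A minimal sketch to confirm the relocated names resolve, assuming langchain-core and langchain-text-splitters are installed (both are standard dependencies of recent langchain releases):

# sketch: verify the relocated imports and construct a Document
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_text_splitters import RecursiveCharacterTextSplitter

doc = Document(page_content="hello world", metadata={"source": "example.txt"})
print(doc.page_content, doc.metadata)

Since BaseRetriever is unchanged apart from its new home in langchain_core, a custom retriever subclass in this file should keep the same _get_relevant_documents(query, *, run_manager) contract after the import swap.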