Jatin Mehra committed on
Commit
d67ce94
·
1 Parent(s): b0ce8b2

Refactor configuration handling in EnhancedDocumentProcessor to use centralized Config class for model and logging settings

Browse files
rag_elements/config.py CHANGED
@@ -2,7 +2,7 @@
2
  # This file contains all configurable parameters for the EnhancedDocumentProcessor
3
 
4
  # Model Configuration
5
- class Config:
6
  # Model Names
7
  CHAT_LLM_MODEL = "llama-3.3-70b-versatile"
8
  VISION_LLM_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
@@ -24,24 +24,6 @@ class Config:
24
  CONTENT_HASH_LENGTH = 8
25
  SOURCE_HASH_LENGTH = 8
26
 
27
- # File Type Configuration
28
- SUPPORTED_EXTENSIONS = {
29
- '.pdf': 'pdf',
30
- '.txt': 'text',
31
- '.md': 'text',
32
- '.py': 'text',
33
- '.js': 'text',
34
- '.html': 'text',
35
- '.csv': 'text',
36
- '.json': 'text',
37
- '.png': 'image',
38
- '.jpg': 'image',
39
- '.jpeg': 'image',
40
- '.bmp': 'image',
41
- '.tiff': 'image',
42
- '.webp': 'image'
43
- }
44
-
45
  # OCR Configuration
46
  OCR_PROMPT = (
47
  "Extract all the text from this image. "
 
2
  # This file contains all configurable parameters for the EnhancedDocumentProcessor
3
 
4
  # Model Configuration
5
+ class Config:
6
  # Model Names
7
  CHAT_LLM_MODEL = "llama-3.3-70b-versatile"
8
  VISION_LLM_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
 
24
  CONTENT_HASH_LENGTH = 8
25
  SOURCE_HASH_LENGTH = 8
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  # OCR Configuration
28
  OCR_PROMPT = (
29
  "Extract all the text from this image. "
rag_elements/enhanced_vectordb.py CHANGED
@@ -20,11 +20,13 @@ from langchain.schema.messages import HumanMessage
20
  from dotenv import load_dotenv
21
  import re
22
 
 
 
23
  # Load environment variables
24
  load_dotenv()
25
 
26
  # Configure logging
27
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
28
  logger = logging.getLogger(__name__)
29
 
30
  class EnhancedDocumentProcessor:
@@ -40,25 +42,25 @@ class EnhancedDocumentProcessor:
40
  self.vision_llm = None
41
  else:
42
  self.vision_llm = ChatGroq(
43
- model="meta-llama/llama-4-scout-17b-16e-instruct",
44
  api_key=self.groq_api_key
45
  )
46
 
47
  # Initialize chat model for analysis
48
  self.chat_llm = ChatGroq(
49
- model="llama-3.3-70b-versatile",
50
  api_key=self.groq_api_key
51
  ) if self.groq_api_key else None
52
 
53
  # Initialize embeddings
54
- self.embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
55
 
56
  # Initialize text splitter with better chunk tracking
57
  self.text_splitter = RecursiveCharacterTextSplitter(
58
- chunk_size=800,
59
- chunk_overlap=100,
60
  length_function=len,
61
- separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""]
62
  )
63
 
64
  # Document tracking
@@ -86,8 +88,8 @@ class EnhancedDocumentProcessor:
86
 
87
  def _generate_chunk_id(self, content: str, source: str, chunk_index: int) -> str:
88
  """Generate a unique ID for a document chunk."""
89
- content_hash = hashlib.md5(content.encode()).hexdigest()[:8]
90
- source_hash = hashlib.md5(source.encode()).hexdigest()[:8]
91
  return f"{source_hash}_{chunk_index}_{content_hash}"
92
 
93
  def _extract_sentences(self, text: str) -> List[Tuple[str, int, int]]:
@@ -128,6 +130,7 @@ class EnhancedDocumentProcessor:
128
  {
129
  "type": "text",
130
  "text": (
 
131
  "Extract all the text from this image. "
132
  "Preserve the structure and formatting as much as possible. "
133
  "If there's no text, return 'No text found'."
@@ -281,7 +284,7 @@ class EnhancedDocumentProcessor:
281
  logger.info(f"Successfully processed {len(documents)} documents from {len(file_paths)} files")
282
  return documents
283
 
284
- def process_directory(self, directory_path: str, recursive: bool = True) -> List[Document]:
285
  """Process all supported files in a directory."""
286
  documents = []
287
  directory = Path(directory_path)
@@ -364,7 +367,7 @@ class EnhancedDocumentProcessor:
364
  logger.info(f"Successfully created FAISS vector store with {len(enhanced_chunks)} chunks")
365
  return vector_store
366
 
367
- def search_with_citations(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
368
  """Search for similar documents and return results with citation information."""
369
  if not self.vector_store:
370
  logger.error("No vector store available. Create or load one first.")
@@ -418,27 +421,10 @@ class EnhancedDocumentProcessor:
418
  contents = [result["content"] for result in search_results]
419
  combined_content = "\n\n---\n\n".join(contents)
420
 
421
- theme_analysis_prompt = f"""
422
- Analyze the following document excerpts and identify common themes related to the query: "{query}"
423
-
424
- Document excerpts:
425
- {combined_content}
426
-
427
- Please provide:
428
- 1. A list of 3-5 main themes/topics that appear across these documents
429
- 2. A brief summary of how these themes relate to the query
430
- 3. Key insights or patterns you notice
431
-
432
- Format your response as JSON with the following structure:
433
- {{
434
- "themes": [
435
- {{"name": "Theme Name", "description": "Brief description", "frequency": "how often it appears"}},
436
- ...
437
- ],
438
- "summary": "Overall summary of themes",
439
- "insights": ["Key insight 1", "Key insight 2", ...]
440
- }}
441
- """
442
 
443
  response = self.chat_llm.invoke(theme_analysis_prompt)
444
 
@@ -476,7 +462,7 @@ class EnhancedDocumentProcessor:
476
  metadata = {
477
  "num_documents": len(self.processed_documents),
478
  "num_chunks": self.vector_store.index.ntotal,
479
- "embedding_model": "all-MiniLM-L6-v2",
480
  "processed_files": [
481
  {
482
  "source": doc.metadata.get("source", ""),
@@ -490,7 +476,7 @@ class EnhancedDocumentProcessor:
490
  "chunk_overlap": self.text_splitter._chunk_overlap
491
  }
492
 
493
- with open(f"{save_path}/enhanced_metadata.json", "w") as f:
494
  json.dump(metadata, f, indent=2)
495
 
496
  logger.info(f"Enhanced vector store saved to {save_path}")
@@ -504,12 +490,12 @@ class EnhancedDocumentProcessor:
504
  vector_store = FAISS.load_local(
505
  load_path,
506
  self.embeddings,
507
- allow_dangerous_deserialization=True
508
  )
509
  self.vector_store = vector_store
510
 
511
  # Load enhanced metadata if available
512
- metadata_path = f"{load_path}/enhanced_metadata.json"
513
  if os.path.exists(metadata_path):
514
  with open(metadata_path, "r") as f:
515
  metadata = json.load(f)
 
20
  from dotenv import load_dotenv
21
  import re
22
 
23
+ from rag_elements.config import Config
24
+
25
  # Load environment variables
26
  load_dotenv()
27
 
28
  # Configure logging
29
+ logging.basicConfig(level=getattr(logging, Config.LOG_LEVEL), format=Config.LOG_FORMAT)
30
  logger = logging.getLogger(__name__)
31
 
32
  class EnhancedDocumentProcessor:
 
42
  self.vision_llm = None
43
  else:
44
  self.vision_llm = ChatGroq(
45
+ model=Config.VISION_LLM_MODEL,
46
  api_key=self.groq_api_key
47
  )
48
 
49
  # Initialize chat model for analysis
50
  self.chat_llm = ChatGroq(
51
+ model=Config.CHAT_LLM_MODEL,
52
  api_key=self.groq_api_key
53
  ) if self.groq_api_key else None
54
 
55
  # Initialize embeddings
56
+ self.embeddings = SentenceTransformerEmbeddings(model_name=Config.EMBEDDINGS_MODEL)
57
 
58
  # Initialize text splitter with better chunk tracking
59
  self.text_splitter = RecursiveCharacterTextSplitter(
60
+ chunk_size=Config.CHUNK_SIZE,
61
+ chunk_overlap= Config.CHUNK_OVERLAP,
62
  length_function=len,
63
+ separators= Config.CHUNK_SEPARATORS
64
  )
65
 
66
  # Document tracking
 
88
 
89
  def _generate_chunk_id(self, content: str, source: str, chunk_index: int) -> str:
90
  """Generate a unique ID for a document chunk."""
91
+ content_hash = hashlib.md5(content.encode()).hexdigest()[:Config.CONTENT_HASH_LENGTH]
92
+ source_hash = hashlib.md5(source.encode()).hexdigest()[:Config.SOURCE_HASH_LENGTH]
93
  return f"{source_hash}_{chunk_index}_{content_hash}"
94
 
95
  def _extract_sentences(self, text: str) -> List[Tuple[str, int, int]]:
 
130
  {
131
  "type": "text",
132
  "text": (
133
+ Config.OCR_PROMPT if Config.OCR_PROMPT else
134
  "Extract all the text from this image. "
135
  "Preserve the structure and formatting as much as possible. "
136
  "If there's no text, return 'No text found'."
 
284
  logger.info(f"Successfully processed {len(documents)} documents from {len(file_paths)} files")
285
  return documents
286
 
287
+ def process_directory(self, directory_path: str, recursive: bool = Config.ENABLE_RECURSIVE_DIRECTORY_PROCESSING) -> List[Document]:
288
  """Process all supported files in a directory."""
289
  documents = []
290
  directory = Path(directory_path)
 
367
  logger.info(f"Successfully created FAISS vector store with {len(enhanced_chunks)} chunks")
368
  return vector_store
369
 
370
+ def search_with_citations(self, query: str, k: int = Config.DEFAULT_SEARCH_K) -> List[Dict[str, Any]]:
371
  """Search for similar documents and return results with citation information."""
372
  if not self.vector_store:
373
  logger.error("No vector store available. Create or load one first.")
 
421
  contents = [result["content"] for result in search_results]
422
  combined_content = "\n\n---\n\n".join(contents)
423
 
424
+ theme_analysis_prompt = Config.THEME_ANALYSIS_PROMPT_TEMPLATE.format(
425
+ query=query,
426
+ content=combined_content[:Config.MAX_CONTENT_LENGTH_FOR_THEME_ANALYSIS]
427
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
 
429
  response = self.chat_llm.invoke(theme_analysis_prompt)
430
 
 
462
  metadata = {
463
  "num_documents": len(self.processed_documents),
464
  "num_chunks": self.vector_store.index.ntotal,
465
+ "embedding_model": Config.EMBEDDINGS_MODEL,
466
  "processed_files": [
467
  {
468
  "source": doc.metadata.get("source", ""),
 
476
  "chunk_overlap": self.text_splitter._chunk_overlap
477
  }
478
 
479
+ with open(f"{save_path}/{Config.ENHANCED_METADATA_FILENAME}", "w") as f:
480
  json.dump(metadata, f, indent=2)
481
 
482
  logger.info(f"Enhanced vector store saved to {save_path}")
 
490
  vector_store = FAISS.load_local(
491
  load_path,
492
  self.embeddings,
493
+ allow_dangerous_deserialization=Config.ENABLE_DANGEROUS_DESERIALIZATION
494
  )
495
  self.vector_store = vector_store
496
 
497
  # Load enhanced metadata if available
498
+ metadata_path = f"{load_path}/{Config.ENHANCED_METADATA_FILENAME}"
499
  if os.path.exists(metadata_path):
500
  with open(metadata_path, "r") as f:
501
  metadata = json.load(f)