Spaces:

yashgori20
/

Swiftcheck

Sleeping

App Files Community

yashgori20 committed on May 26, 2025

Commit

7808f20

verified ·

1 Parent(s): 78e0b1a

Update rag_utils.py

Browse files

Files changed (1) hide show

rag_utils.py +572 -572

rag_utils.py CHANGED Viewed

@@ -1,572 +1,572 @@
-import json
-import sqlite3
-from pathlib import Path
-from typing import List, Dict, Optional, Tuple
-import chromadb
-from chromadb import Settings
-from sentence_transformers import SentenceTransformer
-from datetime import datetime
-class EnhancedRAGUtils:
-    def __init__(self, vector_stores_path: str = "./vector_stores"):
-        self.vector_stores_path = Path(vector_stores_path)
-        # Initialize embedding model (shared across all VDBs)
-        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
-        # Initialize all VDB connections
-        self._init_regulatory_vdb()
-        self._init_product_spec_vdb()
-        self._init_checklist_examples_vdb()
-        print("Enhanced RAG Utils initialized with 3 vector databases")
-    def _init_regulatory_vdb(self):
-        """Initialize regulatory guidelines VDB"""
-        try:
-            self.regulatory_chroma_path = self.vector_stores_path / "chroma_db" / "regulatory_docs"
-            self.regulatory_metadata_db = self.regulatory_chroma_path / "metadata" / "regulatory_metadata.db"
-            self.regulatory_client = chromadb.PersistentClient(
-                path=str(self.regulatory_chroma_path),
-                settings=Settings(anonymized_telemetry=False)
-            )
-            self.regulatory_collection = self.regulatory_client.get_collection("regulatory_guidelines")
-            print("✓ Regulatory VDB connected")
-        except Exception as e:
-            print(f"⚠ Regulatory VDB not available: {e}")
-            self.regulatory_collection = None
-    def _init_product_spec_vdb(self):
-        """Initialize product specifications VDB"""
-        try:
-            self.product_spec_chroma_path = self.vector_stores_path / "chroma_db" / "product_specifications"
-            self.product_spec_metadata_db = self.product_spec_chroma_path / "metadata" / "product_metadata.db"
-            self.product_spec_client = chromadb.PersistentClient(
-                path=str(self.product_spec_chroma_path),
-                settings=Settings(anonymized_telemetry=False)
-            )
-            self.product_spec_collection = self.product_spec_client.get_collection("product_specifications")
-            print("✓ Product Specifications VDB connected")
-        except Exception as e:
-            print(f"⚠ Product Specifications VDB not available: {e}")
-            self.product_spec_collection = None
-    def _init_checklist_examples_vdb(self):
-        """Initialize checklist examples VDB"""
-        try:
-            self.checklist_chroma_path = self.vector_stores_path / "chroma_db" / "checklist_examples"
-            self.checklist_metadata_db = self.checklist_chroma_path / "metadata" / "checklist_structures.db"
-            self.checklist_client = chromadb.PersistentClient(
-                path=str(self.checklist_chroma_path),
-                settings=Settings(anonymized_telemetry=False)
-            )
-            self.checklist_collection = self.checklist_client.get_collection("checklist_examples")
-            print("✓ Checklist Examples VDB connected")
-        except Exception as e:
-            print(f"⚠ Checklist Examples VDB not available: {e}")
-            self.checklist_collection = None
-    def retrieve_regulatory_requirements(self, product_name: str, domain: str = "Food Manufacturing", k: int = 3) -> List[Dict]:
-        """Retrieve relevant regulatory requirements with clause references"""
-        if not self.regulatory_collection:
-            return []
-        try:
-            # Create targeted query
-            query_text = f"{product_name} {domain} regulatory requirements compliance standards Dubai UAE HACCP"
-            query_embedding = self.embedder.encode(query_text).tolist()
-            # Query ChromaDB
-            results = self.regulatory_collection.query(
-                query_embeddings=[query_embedding],
-                n_results=k
-            )
-            guidelines = []
-            if results['documents'] and results['documents'][0]:
-                for i, doc in enumerate(results['documents'][0]):
-                    metadata = results['metadatas'][0][i]
-                    # Get clause reference from metadata
-                    clause_ref = self._extract_clause_reference(metadata, doc)
-                    guidelines.append({
-                        "text": doc[:800],  # Limit text length
-                        "regulatory_body": metadata.get('regulatory_body', 'Unknown'),
-                        "standard_code": metadata.get('standard_code', ''),
-                        "clause_reference": clause_ref,
-                        "topics": metadata.get('topics', ''),
-                        "jurisdiction": metadata.get('jurisdiction', 'UAE'),
-                        "relevance_score": 1 - results['distances'][0][i] if 'distances' in results else 0.5,
-                        "source_type": "regulatory"
-                    })
-            # Sort by relevance and get additional metadata from SQLite
-            guidelines = sorted(guidelines, key=lambda x: x['relevance_score'], reverse=True)
-            return self._enrich_regulatory_data(guidelines)
-        except Exception as e:
-            print(f"Error retrieving regulatory requirements: {str(e)}")
-            return []
-    def retrieve_product_specifications(self, product_name: str, k: int = 3) -> List[Dict]:
-        """Retrieve similar product specifications for depth reference"""
-        if not self.product_spec_collection:
-            return []
-        try:
-            # Create product-focused query
-            query_text = f"{product_name} product specification quality parameters tolerance limits"
-            query_embedding = self.embedder.encode(query_text).tolist()
-            # Query ChromaDB
-            results = self.product_spec_collection.query(
-                query_embeddings=[query_embedding],
-                n_results=k
-            )
-            specifications = []
-            if results['documents'] and results['documents'][0]:
-                for i, doc in enumerate(results['documents'][0]):
-                    metadata = results['metadatas'][0][i]
-                    specifications.append({
-                        "text": doc[:600],
-                        "product_name": metadata.get('product_name', 'Unknown'),
-                        "supplier": metadata.get('supplier', 'Unknown'),
-                        "category": metadata.get('product_category', 'Unknown'),
-                        "specification_type": metadata.get('specification_type', 'Unknown'),
-                        "parameters_count": metadata.get('total_parameters', 0),
-                        "detail_level": metadata.get('detail_level', 'standard'),
-                        "relevance_score": 1 - results['distances'][0][i] if 'distances' in results else 0.5,
-                        "source_type": "product_spec"
-                    })
-            return sorted(specifications, key=lambda x: x['relevance_score'], reverse=True)
-        except Exception as e:
-            print(f"Error retrieving product specifications: {str(e)}")
-            return []
-    def retrieve_checklist_examples(self, product_name: str, k: int = 3) -> List[Dict]:
-        """Retrieve similar checklist examples with parameter structures"""
-        if not self.checklist_collection:
-            return []
-        try:
-            # Create checklist-focused query
-            query_text = f"{product_name} quality control inspection checklist parameters"
-            query_embedding = self.embedder.encode(query_text).tolist()
-            # Query ChromaDB
-            results = self.checklist_collection.query(
-                query_embeddings=[query_embedding],
-                n_results=k
-            )
-            examples = []
-            if results['documents'] and results['documents'][0]:
-                for i, doc in enumerate(results['documents'][0]):
-                    metadata = results['metadatas'][0][i]
-                    # Get parameter structures from metadata
-                    parameter_info = self._extract_parameter_structure(metadata)
-                    examples.append({
-                        "text": doc[:500],
-                        "document_type": metadata.get('document_type', 'QC Checklist'),
-                        "product_name": metadata.get('product_name', 'Unknown'),
-                        "checklist_category": metadata.get('checklist_category', 'General'),
-                        "total_parameters": metadata.get('total_parameters', 0),
-                        "parameter_types": metadata.get('parameter_types', []),
-                        "input_methods": metadata.get('input_methods', []),
-                        "parameter_structure": parameter_info,
-                        "relevance_score": 1 - results['distances'][0][i] if 'distances' in results else 0.5,
-                        "source_type": "checklist_example"
-                    })
-            # Enrich with detailed parameter data from SQLite
-            return self._enrich_checklist_data(examples)
-        except Exception as e:
-            print(f"Error retrieving checklist examples: {str(e)}")
-            return []
-    def retrieve_parameter_patterns(self, product_category: str = "", k: int = 10) -> List[Dict]:
-        """Retrieve common parameter patterns for intelligent type selection"""
-        if not self.checklist_metadata_db.exists():
-            return []
-        try:
-            conn = sqlite3.connect(self.checklist_metadata_db)
-            cursor = conn.cursor()
-            # Get parameter patterns with usage statistics
-            query = """
-                SELECT
-                    cp.parameter_name,
-                    cp.parameter_type,
-                    cp.input_method,
-                    cp.specifications,
-                    cp.options_list,
-                    cp.tolerance_limits,
-                    cp.measurement_units,
-                    cp.has_remarks,
-                    COUNT(*) as usage_frequency,
-                    GROUP_CONCAT(DISTINCT cd.product_name) as used_in_products
-                FROM checklist_parameters cp
-                JOIN checklist_documents cd ON cp.file_hash = cd.file_hash
-                WHERE (? = '' OR cd.checklist_category LIKE ?)
-                GROUP BY cp.parameter_name, cp.parameter_type, cp.input_method
-                ORDER BY usage_frequency DESC, cp.parameter_name
-                LIMIT ?
-            """
-            category_filter = f"%{product_category}%" if product_category else ""
-            cursor.execute(query, (category_filter, category_filter, k))
-            patterns = []
-            for row in cursor.fetchall():
-                patterns.append({
-                    "parameter_name": row[0],
-                    "parameter_type": row[1],
-                    "input_method": row[2],
-                    "specifications": row[3] or "",
-                    "options_list": row[4] or "",
-                    "tolerance_limits": row[5] or "",
-                    "measurement_units": row[6] or "",
-                    "has_remarks": bool(row[7]),
-                    "usage_frequency": row[8],
-                    "used_in_products": row[9].split(',') if row[9] else []
-                })
-            return patterns
-        except Exception as e:
-            print(f"Error retrieving parameter patterns: {str(e)}")
-            return []
-        finally:
-            if 'conn' in locals():
-                conn.close()
-    def get_comprehensive_context(self, product_name: str, domain: str = "Food Manufacturing",
-                                 include_patterns: bool = True) -> Dict:
-        """Get comprehensive context from all VDBs"""
-        context = {
-            "product_name": product_name,
-            "domain": domain,
-            "regulatory_requirements": [],
-            "product_specifications": [],
-            "checklist_examples": [],
-            "parameter_patterns": [],
-            "context_summary": {},
-            "generated_at": datetime.now().isoformat()
-        }
-        print(f"Retrieving comprehensive context for: {product_name}")
-        # Get regulatory requirements
-        context["regulatory_requirements"] = self.retrieve_regulatory_requirements(product_name, domain, k=4)
-        # Get product specifications
-        context["product_specifications"] = self.retrieve_product_specifications(product_name, k=3)
-        # Get checklist examples
-        context["checklist_examples"] = self.retrieve_checklist_examples(product_name, k=4)
-        # Get parameter patterns if requested
-        if include_patterns:
-            context["parameter_patterns"] = self.retrieve_parameter_patterns(k=15)
-        # Generate context summary
-        context["context_summary"] = self._generate_context_summary(context)
-        return context
-    def format_context_for_prompt(self, context: Dict, max_length: int = 4000) -> str:
-        """Format comprehensive context for AI prompt"""
-        formatted_context = "\n# RETRIEVED CONTEXT FOR QC CHECKLIST GENERATION:\n"
-        # Add regulatory compliance requirements
-        if context["regulatory_requirements"]:
-            formatted_context += "\n## 🏛️ REGULATORY COMPLIANCE REQUIREMENTS:\n"
-            for i, req in enumerate(context["regulatory_requirements"][:2], 1):
-                clause_ref = req.get('clause_reference', req.get('standard_code', ''))
-                formatted_context += f"\n### {i}. {req['regulatory_body']} - {clause_ref}\n"
-                if req.get('topics'):
-                    formatted_context += f"**Key Topics**: {req['topics'][:100]}...\n"
-                formatted_context += f"**Requirement**: {req['text'][:300]}...\n"
-                if req.get('jurisdiction'):
-                    formatted_context += f"**Jurisdiction**: {req['jurisdiction']}\n"
-        # Add product specification depth reference
-        if context["product_specifications"]:
-            formatted_context += "\n## 📋 PRODUCT SPECIFICATION DEPTH REFERENCE:\n"
-            for i, spec in enumerate(context["product_specifications"][:2], 1):
-                formatted_context += f"\n### {i}. {spec['product_name']} ({spec['supplier']})\n"
-                formatted_context += f"**Detail Level**: {spec['detail_level']} | **Parameters**: {spec['parameters_count']}\n"
-                formatted_context += f"**Example Content**: {spec['text'][:250]}...\n"
-        # Add checklist structure examples
-        if context["checklist_examples"]:
-            formatted_context += "\n## ✅ PROFESSIONAL CHECKLIST EXAMPLES:\n"
-            for i, example in enumerate(context["checklist_examples"][:2], 1):
-                formatted_context += f"\n### {i}. {example['document_type']} - {example['product_name']}\n"
-                formatted_context += f"**Category**: {example['checklist_category']} | **Parameters**: {example['total_parameters']}\n"
-                if example.get('input_methods'):
-                    methods = ', '.join(example['input_methods'][:5])
-                    formatted_context += f"**Input Methods Used**: {methods}\n"
-                if example.get('parameter_structure'):
-                    formatted_context += "**Sample Parameters**:\n"
-                    for param in example['parameter_structure'][:3]:
-                        formatted_context += f"  - {param['name']}: {param['input_method']}"
-                        if param.get('spec'):
-                            formatted_context += f" (Spec: {param['spec']})"
-                        formatted_context += "\n"
-        # Add intelligent parameter guidance
-        if context["parameter_patterns"]:
-            formatted_context += "\n## 🧠 INTELLIGENT PARAMETER GUIDANCE:\n"
-            # Group patterns by input method
-            method_groups = {}
-            for pattern in context["parameter_patterns"][:12]:
-                method = pattern['input_method']
-                if method not in method_groups:
-                    method_groups[method] = []
-                method_groups[method].append(pattern)
-            for method, patterns in method_groups.items():
-                formatted_context += f"\n**{method} Parameters:**\n"
-                for pattern in patterns[:3]:  # Top 3 per method
-                    formatted_context += f"  • {pattern['parameter_name']}"
-                    if pattern['specifications']:
-                        formatted_context += f" (Spec: {pattern['specifications'][:50]})"
-                    if pattern['options_list']:
-                        formatted_context += f" [Options: {pattern['options_list'][:50]}]"
-                    formatted_context += f" - Used {pattern['usage_frequency']}x\n"
-        # Add context summary with specific guidance
-        if context.get("context_summary"):
-            formatted_context += "\n## 🎯 CONTEXT-BASED GUIDANCE:\n"
-            summary = context["context_summary"]
-            if summary.get("regulatory_focus"):
-                formatted_context += f"**Regulatory Focus**: {summary['regulatory_focus']}\n"
-            if summary.get("recommended_sections"):
-                formatted_context += f"**Recommended Sections**: {', '.join(summary['recommended_sections'])}\n"
-            if summary.get("critical_parameters"):
-                formatted_context += f"**Critical Parameters to Include**: {', '.join(summary['critical_parameters'])}\n"
-            if summary.get("input_method_recommendations"):
-                formatted_context += "**Smart Input Method Selection**:\n"
-                for param_type, method in summary['input_method_recommendations'].items():
-                    formatted_context += f"  • {param_type} → {method}\n"
-        # Truncate if too long
-        if len(formatted_context) > max_length:
-            formatted_context = formatted_context[:max_length] + "\n\n[Context truncated for length...]"
-        return formatted_context
-    def _extract_clause_reference(self, metadata: Dict, document_text: str) -> str:
-        """Extract clause reference from regulatory document"""
-        # Try to get from metadata first
-        standard_code = metadata.get('standard_code', '')
-        regulatory_body = metadata.get('regulatory_body', '')
-        # Look for section numbers in the text
-        import re
-        section_patterns = [
-            r"(Section\s+\d+\.\d+[^.]*)",
-            r"(Principle\s+\d+[^.]*)",
-            r"(\d+\.\d+\s+[A-Z][^.]{10,50})",
-            r"(Article\s+\d+[^.]*)",
-        ]
-        for pattern in section_patterns:
-            match = re.search(pattern, document_text[:500])
-            if match:
-                return f"{match.group(1)} ({regulatory_body})"
-        return f"{standard_code} ({regulatory_body})" if standard_code else regulatory_body
-    def _extract_parameter_structure(self, metadata: Dict) -> List[Dict]:
-        """Extract parameter structure info from checklist metadata"""
-        # Basic structure from metadata
-        structure = []
-        param_types = metadata.get('parameter_types', [])
-        input_methods = metadata.get('input_methods', [])
-        # Create sample structure
-        for i, (ptype, method) in enumerate(zip(param_types[:5], input_methods[:5])):
-            structure.append({
-                "name": f"Sample {ptype}",
-                "type": ptype,
-                "input_method": method,
-                "spec": "",
-                "options": []
-            })
-        return structure
-    def _enrich_regulatory_data(self, guidelines: List[Dict]) -> List[Dict]:
-        """Enrich regulatory data with additional metadata from SQLite"""
-        if not self.regulatory_metadata_db.exists():
-            return guidelines
-        try:
-            conn = sqlite3.connect(self.regulatory_metadata_db)
-            cursor = conn.cursor()
-            for guideline in guidelines:
-                # Get additional topics for this regulatory body
-                cursor.execute("""
-                    SELECT topic, relevance_score
-                    FROM key_topics kt
-                    JOIN regulatory_documents rd ON kt.file_hash = rd.file_hash
-                    WHERE rd.regulatory_body = ?
-                    ORDER BY relevance_score DESC
-                    LIMIT 5
-                """, (guideline['regulatory_body'],))
-                topics = cursor.fetchall()
-                if topics:
-                    guideline['key_topics'] = [{"topic": t[0], "relevance": t[1]} for t in topics]
-            return guidelines
-        except Exception as e:
-            print(f"Error enriching regulatory data: {e}")
-            return guidelines
-        finally:
-            if 'conn' in locals():
-                conn.close()
-    def _enrich_checklist_data(self, examples: List[Dict]) -> List[Dict]:
-        """Enrich checklist data with detailed parameter information"""
-        if not self.checklist_metadata_db.exists():
-            return examples
-        try:
-            conn = sqlite3.connect(self.checklist_metadata_db)
-            cursor = conn.cursor()
-            for example in examples:
-                # Get actual parameter details
-                cursor.execute("""
-                    SELECT parameter_name, parameter_type, input_method,
-                           specifications, options_list, tolerance_limits
-                    FROM checklist_parameters cp
-                    JOIN checklist_documents cd ON cp.file_hash = cd.file_hash
-                    WHERE cd.filename = ?
-                    ORDER BY cp.parameter_order
-                    LIMIT 10
-                """, (example.get('text', '')[:50],))  # Approximate match
-                params = cursor.fetchall()
-                if params:
-                    example['detailed_parameters'] = [
-                        {
-                            "name": p[0],
-                            "type": p[1],
-                            "input_method": p[2],
-                            "spec": p[3] or "",
-                            "options": p[4] or "",
-                            "tolerance": p[5] or ""
-                        } for p in params
-                    ]
-            return examples
-        except Exception as e:
-            print(f"Error enriching checklist data: {e}")
-            return examples
-        finally:
-            if 'conn' in locals():
-                conn.close()
-    def _generate_context_summary(self, context: Dict) -> Dict:
-        """Generate intelligent summary of retrieved context"""
-        summary = {
-            "regulatory_focus": "",
-            "recommended_sections": [],
-            "critical_parameters": [],
-            "input_method_recommendations": {},
-            "compliance_requirements": []
-        }
-        # Analyze regulatory requirements
-        if context["regulatory_requirements"]:
-            bodies = [req['regulatory_body'] for req in context["regulatory_requirements"]]
-            if "Dubai Municipality" in bodies:
-                summary["regulatory_focus"] = "Dubai Municipality HACCP Guidelines compliance required"
-            elif "HACCP" in " ".join(bodies):
-                summary["regulatory_focus"] = "HACCP principles implementation required"
-        # Extract recommended sections from examples
-        sections = set()
-        for example in context["checklist_examples"]:
-            category = example.get('checklist_category', '')
-            if category and category != 'General':
-                sections.add(category)
-        summary["recommended_sections"] = list(sections)[:5]
-        # Identify critical parameters from patterns
-        critical_params = []
-        for pattern in context["parameter_patterns"][:10]:
-            if pattern['usage_frequency'] > 1:  # Used multiple times
-                critical_params.append(pattern['parameter_name'])
-        summary["critical_parameters"] = critical_params[:8]
-        # Generate input method recommendations
-        method_mapping = {}
-        for pattern in context["parameter_patterns"]:
-            param_type = pattern['parameter_type']
-            input_method = pattern['input_method']
-            if param_type not in method_mapping:
-                method_mapping[param_type] = input_method
-        summary["input_method_recommendations"] = method_mapping
-        return summary
-# Singleton instance for global use
-rag_utils = EnhancedRAGUtils()
-# Export convenience functions
-def get_comprehensive_context(product_name: str, domain: str = "Food Manufacturing") -> Dict:
-    """Get comprehensive context from all VDBs"""
-    return rag_utils.get_comprehensive_context(product_name, domain)
-def format_context_for_prompt(context: Dict, max_length: int = 4000) -> str:
-    """Format context for AI prompt"""
-    return rag_utils.format_context_for_prompt(context, max_length)
-def retrieve_regulatory_requirements(product_name: str, domain: str = "Food Manufacturing") -> List[Dict]:
-    """Get regulatory requirements"""
-    return rag_utils.retrieve_regulatory_requirements(product_name, domain)
-def retrieve_checklist_examples(product_name: str) -> List[Dict]:
-    """Get checklist examples"""
-    return rag_utils.retrieve_checklist_examples(product_name)
-def retrieve_parameter_patterns(product_category: str = "") -> List[Dict]:
-    """Get parameter patterns"""
-    return rag_utils.retrieve_parameter_patterns(product_category)

+import json
+import sqlite3
+from pathlib import Path
+from typing import List, Dict, Optional, Tuple
+import chromadb
+from chromadb import Settings
+from sentence_transformers import SentenceTransformer
+from datetime import datetime
+class EnhancedRAGUtils:
+    def __init__(self, vector_stores_path: str = "./vector_stores"):
+        self.vector_stores_path = Path(vector_stores_path)
+        # Initialize embedding model (shared across all VDBs)
+        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
+        # Initialize all VDB connections
+        self._init_regulatory_vdb()
+        self._init_product_spec_vdb()
+        self._init_checklist_examples_vdb()
+        print("Enhanced RAG Utils initialized with 3 vector databases")
+    def _init_regulatory_vdb(self):
+        """Initialize regulatory guidelines VDB"""
+        try:
+            self.regulatory_chroma_path = self.vector_stores_path / "chroma_db" / "regulatory_docs"
+            self.regulatory_metadata_db = self.regulatory_chroma_path / "metadata" / "regulatory_metadata.db"
+            self.regulatory_client = chromadb.PersistentClient(
+                path=str(self.regulatory_chroma_path),
+                settings=Settings(anonymized_telemetry=False)
+            )
+            self.regulatory_collection = self.regulatory_client.get_collection("regulatory_guidelines")
+            print("✓ Regulatory VDB connected")
+        except Exception as e:
+            print(f"⚠ Regulatory VDB not available: {e}")
+            self.regulatory_collection = None
+    def _init_product_spec_vdb(self):
+        """Initialize product specifications VDB"""
+        try:
+            self.product_spec_chroma_path = self.vector_stores_path / "chroma_db" / "product_specs"
+            self.product_spec_metadata_db = self.product_spec_chroma_path / "metadata" / "product_metadata.db"
+            self.product_spec_client = chromadb.PersistentClient(
+                path=str(self.product_spec_chroma_path),
+                settings=Settings(anonymized_telemetry=False)
+            )
+            self.product_spec_collection = self.product_spec_client.get_collection("product_specifications")
+            print("✓ Product Specifications VDB connected")
+        except Exception as e:
+            print(f"⚠ Product Specifications VDB not available: {e}")
+            self.product_spec_collection = None
+    def _init_checklist_examples_vdb(self):
+        """Initialize checklist examples VDB"""
+        try:
+            self.checklist_chroma_path = self.vector_stores_path / "chroma_db" / "checklist_examples"
+            self.checklist_metadata_db = self.checklist_chroma_path / "metadata" / "checklist_structures.db"
+            self.checklist_client = chromadb.PersistentClient(
+                path=str(self.checklist_chroma_path),
+                settings=Settings(anonymized_telemetry=False)
+            )
+            self.checklist_collection = self.checklist_client.get_collection("checklist_examples")
+            print("✓ Checklist Examples VDB connected")
+        except Exception as e:
+            print(f"⚠ Checklist Examples VDB not available: {e}")
+            self.checklist_collection = None
+    def retrieve_regulatory_requirements(self, product_name: str, domain: str = "Food Manufacturing", k: int = 3) -> List[Dict]:
+        """Retrieve relevant regulatory requirements with clause references"""
+        if not self.regulatory_collection:
+            return []
+        try:
+            # Create targeted query
+            query_text = f"{product_name} {domain} regulatory requirements compliance standards Dubai UAE HACCP"
+            query_embedding = self.embedder.encode(query_text).tolist()
+            # Query ChromaDB
+            results = self.regulatory_collection.query(
+                query_embeddings=[query_embedding],
+                n_results=k
+            )
+            guidelines = []
+            if results['documents'] and results['documents'][0]:
+                for i, doc in enumerate(results['documents'][0]):
+                    metadata = results['metadatas'][0][i]
+                    # Get clause reference from metadata
+                    clause_ref = self._extract_clause_reference(metadata, doc)
+                    guidelines.append({
+                        "text": doc[:800],  # Limit text length
+                        "regulatory_body": metadata.get('regulatory_body', 'Unknown'),
+                        "standard_code": metadata.get('standard_code', ''),
+                        "clause_reference": clause_ref,
+                        "topics": metadata.get('topics', ''),
+                        "jurisdiction": metadata.get('jurisdiction', 'UAE'),
+                        "relevance_score": 1 - results['distances'][0][i] if 'distances' in results else 0.5,
+                        "source_type": "regulatory"
+                    })
+            # Sort by relevance and get additional metadata from SQLite
+            guidelines = sorted(guidelines, key=lambda x: x['relevance_score'], reverse=True)
+            return self._enrich_regulatory_data(guidelines)
+        except Exception as e:
+            print(f"Error retrieving regulatory requirements: {str(e)}")
+            return []
+    def retrieve_product_specifications(self, product_name: str, k: int = 3) -> List[Dict]:
+        """Retrieve product specifications similar to ``product_name``.
+
+        Args:
+            product_name: Product to look up; embedded together with a fixed
+                specification-oriented suffix.
+            k: Number of results requested from the product-specification
+                collection.
+
+        Returns:
+            One dict per hit (text truncated to 600 characters plus selected
+            metadata fields), sorted by descending ``relevance_score``. An empty
+            list when the collection is unavailable or the query fails.
+        """
+        if not self.product_spec_collection:
+            return []
+        try:
+            # Create product-focused query
+            query_text = f"{product_name} product specification quality parameters tolerance limits"
+            query_embedding = self.embedder.encode(query_text).tolist()
+            # Query ChromaDB
+            results = self.product_spec_collection.query(
+                query_embeddings=[query_embedding],
+                n_results=k
+            )
+            # Only one query embedding is sent, so only the first result row matters.
+            # A result field may be missing or None; treat that as "no data".
+            documents = (results.get('documents') or [[]])[0]
+            metadatas = (results.get('metadatas') or [[]])[0]
+            distances = (results.get('distances') or [[]])[0]
+            specifications = []
+            for i, doc in enumerate(documents):
+                # A hit without metadata or distance falls back to the defaults
+                # instead of aborting the whole retrieval.
+                metadata = (metadatas[i] if i < len(metadatas) else None) or {}
+                distance = distances[i] if i < len(distances) else None
+                specifications.append({
+                    "text": (doc or "")[:600],
+                    "product_name": metadata.get('product_name', 'Unknown'),
+                    "supplier": metadata.get('supplier', 'Unknown'),
+                    "category": metadata.get('product_category', 'Unknown'),
+                    "specification_type": metadata.get('specification_type', 'Unknown'),
+                    "parameters_count": metadata.get('total_parameters', 0),
+                    "detail_level": metadata.get('detail_level', 'standard'),
+                    "relevance_score": 0.5 if distance is None else 1 - distance,
+                    "source_type": "product_spec"
+                })
+            return sorted(specifications, key=lambda x: x['relevance_score'], reverse=True)
+        except Exception as e:
+            print(f"Error retrieving product specifications: {str(e)}")
+            return []
+    def retrieve_checklist_examples(self, product_name: str, k: int = 3) -> List[Dict]:
+        """Retrieve checklist examples similar to ``product_name``.
+
+        Args:
+            product_name: Product to look up; embedded together with a fixed
+                checklist-oriented suffix.
+            k: Number of results requested from the checklist collection.
+
+        Returns:
+            One dict per hit (text truncated to 500 characters, selected metadata
+            fields and a ``parameter_structure`` built by
+            ``_extract_parameter_structure``), passed through
+            ``_enrich_checklist_data``. An empty list when the collection is
+            unavailable or the query fails.
+        """
+        if not self.checklist_collection:
+            return []
+        try:
+            # Create checklist-focused query
+            query_text = f"{product_name} quality control inspection checklist parameters"
+            query_embedding = self.embedder.encode(query_text).tolist()
+            # Query ChromaDB
+            results = self.checklist_collection.query(
+                query_embeddings=[query_embedding],
+                n_results=k
+            )
+            # Only one query embedding is sent, so only the first result row matters.
+            # A result field may be missing or None; treat that as "no data".
+            documents = (results.get('documents') or [[]])[0]
+            metadatas = (results.get('metadatas') or [[]])[0]
+            distances = (results.get('distances') or [[]])[0]
+            examples = []
+            for i, doc in enumerate(documents):
+                # A hit without metadata or distance falls back to the defaults
+                # instead of aborting the whole retrieval.
+                metadata = (metadatas[i] if i < len(metadatas) else None) or {}
+                distance = distances[i] if i < len(distances) else None
+                # Get parameter structures from metadata
+                parameter_info = self._extract_parameter_structure(metadata)
+                examples.append({
+                    "text": (doc or "")[:500],
+                    "document_type": metadata.get('document_type', 'QC Checklist'),
+                    "product_name": metadata.get('product_name', 'Unknown'),
+                    "checklist_category": metadata.get('checklist_category', 'General'),
+                    "total_parameters": metadata.get('total_parameters', 0),
+                    "parameter_types": metadata.get('parameter_types', []),
+                    "input_methods": metadata.get('input_methods', []),
+                    "parameter_structure": parameter_info,
+                    "relevance_score": 0.5 if distance is None else 1 - distance,
+                    "source_type": "checklist_example"
+                })
+            # Enrich with detailed parameter data from SQLite
+            return self._enrich_checklist_data(examples)
+        except Exception as e:
+            print(f"Error retrieving checklist examples: {str(e)}")
+            return []
+    def retrieve_parameter_patterns(self, product_category: str = "", k: int = 10) -> List[Dict]:
+        """Return the most frequently used checklist parameter definitions.
+
+        Reads the checklist metadata SQLite database, grouping parameters by
+        name, type and input method and counting the rows in each group.
+
+        Args:
+            product_category: Optional substring matched (SQL ``LIKE``) against
+                the checklist category; an empty string disables the filter.
+            k: Maximum number of patterns returned.
+
+        Returns:
+            Pattern dicts ordered by descending usage frequency, then parameter
+            name. An empty list when the database file does not exist or the
+            query fails.
+        """
+        if not self.checklist_metadata_db.exists():
+            return []
+        connection = None
+        try:
+            connection = sqlite3.connect(self.checklist_metadata_db)
+            # Get parameter patterns with usage statistics
+            query = """
+                SELECT
+                    cp.parameter_name,
+                    cp.parameter_type,
+                    cp.input_method,
+                    cp.specifications,
+                    cp.options_list,
+                    cp.tolerance_limits,
+                    cp.measurement_units,
+                    cp.has_remarks,
+                    COUNT(*) as usage_frequency,
+                    GROUP_CONCAT(DISTINCT cd.product_name) as used_in_products
+                FROM checklist_parameters cp
+                JOIN checklist_documents cd ON cp.file_hash = cd.file_hash
+                WHERE (? = '' OR cd.checklist_category LIKE ?)
+                GROUP BY cp.parameter_name, cp.parameter_type, cp.input_method
+                ORDER BY usage_frequency DESC, cp.parameter_name
+                LIMIT ?
+            """
+            # The same value is bound twice: once for the "no filter" test, once for LIKE.
+            like_value = f"%{product_category}%" if product_category else ""
+            rows = connection.cursor().execute(query, (like_value, like_value, k)).fetchall()
+            return [
+                {
+                    "parameter_name": name,
+                    "parameter_type": ptype,
+                    "input_method": method,
+                    "specifications": specs or "",
+                    "options_list": options or "",
+                    "tolerance_limits": tolerance or "",
+                    "measurement_units": units or "",
+                    "has_remarks": bool(remarks),
+                    "usage_frequency": frequency,
+                    "used_in_products": products.split(',') if products else []
+                }
+                for (name, ptype, method, specs, options,
+                     tolerance, units, remarks, frequency, products) in rows
+            ]
+        except Exception as exc:
+            print(f"Error retrieving parameter patterns: {str(exc)}")
+            return []
+        finally:
+            # Close the connection only if it was actually opened.
+            if connection is not None:
+                connection.close()
+    def get_comprehensive_context(self, product_name: str, domain: str = "Food Manufacturing",
+                                 include_patterns: bool = True) -> Dict:
+        """Collect retrieval results from every source into one context dict.
+
+        Args:
+            product_name: Product the context is built for.
+            domain: Domain passed to the regulatory retrieval.
+            include_patterns: When False, ``parameter_patterns`` stays empty.
+
+        Returns:
+            Dict with the product name, domain, the four retrieval result lists,
+            a ``context_summary`` derived from them and an ISO ``generated_at``
+            timestamp.
+        """
+        # Timestamp is taken before any retrieval runs.
+        started_at = datetime.now().isoformat()
+        print(f"Retrieving comprehensive context for: {product_name}")
+        regulatory = self.retrieve_regulatory_requirements(product_name, domain, k=4)
+        specifications = self.retrieve_product_specifications(product_name, k=3)
+        examples = self.retrieve_checklist_examples(product_name, k=4)
+        # Patterns are fetched without a category filter.
+        patterns = self.retrieve_parameter_patterns(k=15) if include_patterns else []
+        context = {
+            "product_name": product_name,
+            "domain": domain,
+            "regulatory_requirements": regulatory,
+            "product_specifications": specifications,
+            "checklist_examples": examples,
+            "parameter_patterns": patterns,
+            "context_summary": {},
+            "generated_at": started_at
+        }
+        # The summary is computed from the collected lists and stored last.
+        context["context_summary"] = self._generate_context_summary(context)
+        return context
+    def format_context_for_prompt(self, context: Dict, max_length: int = 4000) -> str:
+        """Render a retrieved-context dict as markdown-style text for an AI prompt.
+
+        At most two regulatory requirements, two product specifications and two
+        checklist examples are rendered, followed by parameter patterns grouped
+        by input method (first 12 patterns, three per method) and the context
+        summary.
+
+        Args:
+            context: Context dict as produced by ``get_comprehensive_context``.
+                Missing or empty sections are skipped.
+            max_length: Upper bound for the length of the returned text.
+
+        Returns:
+            The formatted text. When it would exceed ``max_length`` it is cut and
+            ends with a truncation notice; the notice counts towards the limit.
+        """
+        truncation_notice = "\n\n[Context truncated for length...]"
+        # Pieces are collected in a list and joined once at the end.
+        parts = ["\n# RETRIEVED CONTEXT FOR QC CHECKLIST GENERATION:\n"]
+        # Add regulatory compliance requirements
+        regulatory = context.get("regulatory_requirements") or []
+        if regulatory:
+            parts.append("\n## 🏛️ REGULATORY COMPLIANCE REQUIREMENTS:\n")
+            for i, req in enumerate(regulatory[:2], 1):
+                clause_ref = req.get('clause_reference', req.get('standard_code', ''))
+                parts.append(f"\n### {i}. {req['regulatory_body']} - {clause_ref}\n")
+                if req.get('topics'):
+                    parts.append(f"**Key Topics**: {req['topics'][:100]}...\n")
+                parts.append(f"**Requirement**: {req['text'][:300]}...\n")
+                if req.get('jurisdiction'):
+                    parts.append(f"**Jurisdiction**: {req['jurisdiction']}\n")
+        # Add product specification depth reference
+        specifications = context.get("product_specifications") or []
+        if specifications:
+            parts.append("\n## 📋 PRODUCT SPECIFICATION DEPTH REFERENCE:\n")
+            for i, spec in enumerate(specifications[:2], 1):
+                parts.append(f"\n### {i}. {spec['product_name']} ({spec['supplier']})\n")
+                parts.append(f"**Detail Level**: {spec['detail_level']} | **Parameters**: {spec['parameters_count']}\n")
+                parts.append(f"**Example Content**: {spec['text'][:250]}...\n")
+        # Add checklist structure examples
+        examples = context.get("checklist_examples") or []
+        if examples:
+            parts.append("\n## ✅ PROFESSIONAL CHECKLIST EXAMPLES:\n")
+            for i, example in enumerate(examples[:2], 1):
+                parts.append(f"\n### {i}. {example['document_type']} - {example['product_name']}\n")
+                parts.append(f"**Category**: {example['checklist_category']} | **Parameters**: {example['total_parameters']}\n")
+                input_methods = example.get('input_methods')
+                if input_methods:
+                    # NOTE(review): input_methods may arrive as one string rather than a
+                    # list, depending on how the metadata was stored - confirm. Joining a
+                    # string would interleave its characters, so it is used as-is.
+                    if isinstance(input_methods, str):
+                        methods = input_methods
+                    else:
+                        methods = ', '.join(input_methods[:5])
+                    parts.append(f"**Input Methods Used**: {methods}\n")
+                if example.get('parameter_structure'):
+                    parts.append("**Sample Parameters**:\n")
+                    for param in example['parameter_structure'][:3]:
+                        line = f"  - {param['name']}: {param['input_method']}"
+                        if param.get('spec'):
+                            line += f" (Spec: {param['spec']})"
+                        parts.append(line + "\n")
+        # Add intelligent parameter guidance
+        patterns = context.get("parameter_patterns") or []
+        if patterns:
+            parts.append("\n## 🧠 INTELLIGENT PARAMETER GUIDANCE:\n")
+            # Group patterns by input method, keeping first-seen order
+            method_groups = {}
+            for pattern in patterns[:12]:
+                method_groups.setdefault(pattern['input_method'], []).append(pattern)
+            for method, grouped in method_groups.items():
+                parts.append(f"\n**{method} Parameters:**\n")
+                for pattern in grouped[:3]:  # Top 3 per method
+                    line = f"  • {pattern['parameter_name']}"
+                    if pattern['specifications']:
+                        line += f" (Spec: {pattern['specifications'][:50]})"
+                    if pattern['options_list']:
+                        line += f" [Options: {pattern['options_list'][:50]}]"
+                    parts.append(line + f" - Used {pattern['usage_frequency']}x\n")
+        # Add context summary with specific guidance
+        summary = context.get("context_summary")
+        if summary:
+            parts.append("\n## 🎯 CONTEXT-BASED GUIDANCE:\n")
+            if summary.get("regulatory_focus"):
+                parts.append(f"**Regulatory Focus**: {summary['regulatory_focus']}\n")
+            if summary.get("recommended_sections"):
+                parts.append(f"**Recommended Sections**: {', '.join(summary['recommended_sections'])}\n")
+            if summary.get("critical_parameters"):
+                parts.append(f"**Critical Parameters to Include**: {', '.join(summary['critical_parameters'])}\n")
+            if summary.get("input_method_recommendations"):
+                parts.append("**Smart Input Method Selection**:\n")
+                for param_type, method in summary['input_method_recommendations'].items():
+                    parts.append(f"  • {param_type} → {method}\n")
+        formatted_context = "".join(parts)
+        # Truncate if too long, reserving room for the notice so the result
+        # never exceeds max_length.
+        if len(formatted_context) > max_length:
+            limit = max(max_length, 0)
+            keep = max(limit - len(truncation_notice), 0)
+            formatted_context = (formatted_context[:keep] + truncation_notice)[:limit]
+        return formatted_context
+    def _extract_clause_reference(self, metadata: Dict, document_text: str) -> str:
+        """Build a human-readable clause reference for a regulatory chunk.
+
+        The first 500 characters of ``document_text`` are searched for a
+        section, principle, numbered-heading or article marker (in that order).
+
+        Args:
+            metadata: Chunk metadata; ``standard_code`` and ``regulatory_body``
+                are read from it.
+            document_text: Text of the regulatory chunk.
+
+        Returns:
+            ``"<matched marker> (<regulatory body>)"`` when a marker is found,
+            otherwise ``"<standard code> (<regulatory body>)"``, or just the
+            regulatory body when there is no standard code.
+        """
+        import re
+        code = metadata.get('standard_code', '')
+        body = metadata.get('regulatory_body', '')
+        # Only the start of the chunk is inspected.
+        head = document_text[:500]
+        marker_patterns = (
+            r"(Section\s+\d+\.\d+[^.]*)",
+            r"(Principle\s+\d+[^.]*)",
+            r"(\d+\.\d+\s+[A-Z][^.]{10,50})",
+            r"(Article\s+\d+[^.]*)",
+        )
+        # Patterns are tried lazily, so earlier entries take priority.
+        attempts = (re.search(pattern, head) for pattern in marker_patterns)
+        found = next((hit for hit in attempts if hit), None)
+        if found is not None:
+            return f"{found.group(1)} ({body})"
+        if code:
+            return f"{code} ({body})"
+        return body
+    def _extract_parameter_structure(self, metadata: Dict) -> List[Dict]:
+        """Build placeholder parameter entries from checklist metadata.
+
+        Pairs the first five ``parameter_types`` with the first five
+        ``input_methods`` positionally; the shorter sequence bounds the result.
+
+        Args:
+            metadata: Checklist metadata; missing keys are treated as empty.
+
+        Returns:
+            One dict per pair with a generated ``"Sample <type>"`` name, the type,
+            the input method, an empty ``spec`` and an empty ``options`` list.
+        """
+        # NOTE(review): if these metadata values are stored as strings rather than
+        # lists, slicing and zip operate on single characters - confirm upstream.
+        kinds = metadata.get('parameter_types', [])[:5]
+        methods = metadata.get('input_methods', [])[:5]
+        return [
+            {
+                "name": f"Sample {kind}",
+                "type": kind,
+                "input_method": method,
+                "spec": "",
+                "options": []
+            }
+            for kind, method in zip(kinds, methods)
+        ]
+    def _enrich_regulatory_data(self, guidelines: List[Dict]) -> List[Dict]:
+        """Attach key topics from the regulatory metadata database.
+
+        For each guideline, up to five topics recorded for its regulatory body
+        are stored under ``key_topics``; guidelines without topics are left
+        unchanged.
+
+        Args:
+            guidelines: Guideline dicts carrying a ``regulatory_body`` key;
+                modified in place.
+
+        Returns:
+            The same ``guidelines`` list, also when the database file is missing
+            or a query fails part-way.
+        """
+        if not self.regulatory_metadata_db.exists():
+            return guidelines
+        connection = None
+        try:
+            connection = sqlite3.connect(self.regulatory_metadata_db)
+            reader = connection.cursor()
+            # Get additional topics for a regulatory body
+            topic_query = """
+                    SELECT topic, relevance_score
+                    FROM key_topics kt
+                    JOIN regulatory_documents rd ON kt.file_hash = rd.file_hash
+                    WHERE rd.regulatory_body = ?
+                    ORDER BY relevance_score DESC
+                    LIMIT 5
+                """
+            for entry in guidelines:
+                reader.execute(topic_query, (entry['regulatory_body'],))
+                rows = reader.fetchall()
+                if rows:
+                    entry['key_topics'] = [{"topic": name, "relevance": score} for name, score in rows]
+        except Exception as exc:
+            # Best effort: report the problem and hand back what we have.
+            print(f"Error enriching regulatory data: {exc}")
+        finally:
+            if connection is not None:
+                connection.close()
+        return guidelines
+    def _enrich_checklist_data(self, examples: List[Dict]) -> List[Dict]:
+        """Attach stored parameter details to each checklist example.
+
+        Each example is matched to the checklist metadata database by its
+        ``product_name``; up to 10 parameters are stored under
+        ``detailed_parameters``. Examples without a usable product name, or with
+        no matching rows, are left unchanged.
+
+        Args:
+            examples: Example dicts carrying a ``product_name`` key; modified in
+                place.
+
+        Returns:
+            The same ``examples`` list, also when the database file is missing or
+            a query fails part-way.
+        """
+        if not self.checklist_metadata_db.exists():
+            return examples
+        conn = None
+        try:
+            conn = sqlite3.connect(self.checklist_metadata_db)
+            cursor = conn.cursor()
+            for example in examples:
+                product_name = example.get('product_name')
+                # 'Unknown' is the placeholder for a missing product name; it
+                # identifies no document, so there is nothing to look up.
+                if not product_name or product_name == 'Unknown':
+                    continue
+                # Match on the product name. The earlier lookup compared the first 50
+                # characters of the example text with cd.filename, which is not a
+                # filename and therefore did not find the document.
+                # Ordering by file_hash first keeps one document's parameters together.
+                cursor.execute("""
+                    SELECT parameter_name, parameter_type, input_method,
+                           specifications, options_list, tolerance_limits
+                    FROM checklist_parameters cp
+                    JOIN checklist_documents cd ON cp.file_hash = cd.file_hash
+                    WHERE cd.product_name = ?
+                    ORDER BY cd.file_hash, cp.parameter_order
+                    LIMIT 10
+                """, (product_name,))
+                params = cursor.fetchall()
+                if params:
+                    example['detailed_parameters'] = [
+                        {
+                            "name": p[0],
+                            "type": p[1],
+                            "input_method": p[2],
+                            "spec": p[3] or "",
+                            "options": p[4] or "",
+                            "tolerance": p[5] or ""
+                        } for p in params
+                    ]
+            return examples
+        except Exception as e:
+            print(f"Error enriching checklist data: {e}")
+            return examples
+        finally:
+            # Close the connection only if it was actually opened.
+            if conn is not None:
+                conn.close()
+    def _generate_context_summary(self, context: Dict) -> Dict:
+        """Derive prompt guidance from the retrieved context.
+
+        Args:
+            context: Dict with ``regulatory_requirements``,
+                ``checklist_examples`` and ``parameter_patterns`` lists.
+
+        Returns:
+            Dict with:
+            ``regulatory_focus`` - a fixed sentence chosen from the regulatory
+            bodies present, or an empty string;
+            ``recommended_sections`` - up to five distinct non-"General"
+            checklist categories in first-seen order;
+            ``critical_parameters`` - up to eight names among the first ten
+            patterns that have a usage frequency above one;
+            ``input_method_recommendations`` - the first input method seen for
+            each parameter type;
+            ``compliance_requirements`` - always an empty list here.
+        """
+        summary = {
+            "regulatory_focus": "",
+            "recommended_sections": [],
+            "critical_parameters": [],
+            "input_method_recommendations": {},
+            "compliance_requirements": []
+        }
+        # Analyze regulatory requirements
+        if context["regulatory_requirements"]:
+            bodies = [req['regulatory_body'] for req in context["regulatory_requirements"]]
+            if "Dubai Municipality" in bodies:
+                summary["regulatory_focus"] = "Dubai Municipality HACCP Guidelines compliance required"
+            elif "HACCP" in " ".join(bodies):
+                summary["regulatory_focus"] = "HACCP principles implementation required"
+        # Extract recommended sections from examples. A dict serves as an ordered
+        # set so the selection and its order are the same on every run
+        # (slicing a plain set is not).
+        sections = {}
+        for example in context["checklist_examples"]:
+            category = example.get('checklist_category', '')
+            if category and category != 'General':
+                sections.setdefault(category, None)
+        summary["recommended_sections"] = list(sections)[:5]
+        # Identify critical parameters from patterns (used more than once)
+        critical_params = [
+            pattern['parameter_name']
+            for pattern in context["parameter_patterns"][:10]
+            if pattern['usage_frequency'] > 1
+        ]
+        summary["critical_parameters"] = critical_params[:8]
+        # Generate input method recommendations: the first method seen per type wins
+        method_mapping = {}
+        for pattern in context["parameter_patterns"]:
+            method_mapping.setdefault(pattern['parameter_type'], pattern['input_method'])
+        summary["input_method_recommendations"] = method_mapping
+        return summary
+# Singleton instance for global use.
+# Constructed at import time: importing this module runs EnhancedRAGUtils.__init__,
+# which loads the sentence-transformer embedder and tries to connect the vector
+# databases. The module-level convenience functions below delegate to it.
+rag_utils = EnhancedRAGUtils()
+# Export convenience functions
+def get_comprehensive_context(product_name: str, domain: str = "Food Manufacturing",
+                              include_patterns: bool = True) -> Dict:
+    """Get comprehensive context from all VDBs via the shared ``rag_utils`` instance.
+
+    Args:
+        product_name: Product the context is built for.
+        domain: Domain passed to the regulatory retrieval.
+        include_patterns: Forwarded to the method; when False the parameter
+            patterns are not retrieved.
+
+    Returns:
+        The context dict built by ``EnhancedRAGUtils.get_comprehensive_context``.
+    """
+    return rag_utils.get_comprehensive_context(
+        product_name, domain, include_patterns=include_patterns
+    )
+def format_context_for_prompt(context: Dict, max_length: int = 4000) -> str:
+    """Render ``context`` as prompt text using the shared ``rag_utils`` instance.
+
+    Args:
+        context: Context dict to format.
+        max_length: Length limit forwarded to the method.
+
+    Returns:
+        The formatted prompt text.
+    """
+    prompt_text = rag_utils.format_context_for_prompt(context, max_length=max_length)
+    return prompt_text
+def retrieve_regulatory_requirements(product_name: str, domain: str = "Food Manufacturing") -> List[Dict]:
+    """Look up regulatory requirements through the shared ``rag_utils`` instance.
+
+    Args:
+        product_name: Product to look up.
+        domain: Domain forwarded to the method.
+
+    Returns:
+        The list returned by ``EnhancedRAGUtils.retrieve_regulatory_requirements``.
+    """
+    requirements = rag_utils.retrieve_regulatory_requirements(product_name, domain)
+    return requirements
+def retrieve_checklist_examples(product_name: str, k: int = 3) -> List[Dict]:
+    """Get checklist examples through the shared ``rag_utils`` instance.
+
+    Args:
+        product_name: Product to look up.
+        k: Number of examples requested; forwarded to the method, whose own
+            default is also 3.
+
+    Returns:
+        The list returned by ``EnhancedRAGUtils.retrieve_checklist_examples``.
+    """
+    return rag_utils.retrieve_checklist_examples(product_name, k=k)
+def retrieve_parameter_patterns(product_category: str = "", k: int = 10) -> List[Dict]:
+    """Get parameter patterns through the shared ``rag_utils`` instance.
+
+    Args:
+        product_category: Optional category filter; an empty string disables it.
+        k: Maximum number of patterns; forwarded to the method, whose own
+            default is also 10.
+
+    Returns:
+        The list returned by ``EnhancedRAGUtils.retrieve_parameter_patterns``.
+    """
+    return rag_utils.retrieve_parameter_patterns(product_category, k=k)