Update process_aware_rag.py

process_aware_rag.py  CHANGED  +72 -131
@@ -20,31 +20,28 @@ class ProcessAwareRAG:
         self.legal_graph = LegalProcessGraph()
         self.legal_graph.load_graph('legal_processes.pkl')
 
         # Initialize vector store
         chroma_path = os.getenv('CHROMA_DB_PATH', '/tmp/legal_vector_db')
         os.makedirs(chroma_path, exist_ok=True)
 
         # Redirect model caches
         default_cache_root = os.getenv('CACHE_ROOT', '/data/cache')
         os.environ.setdefault('HOME', '/data')
         os.makedirs(default_cache_root, exist_ok=True)
-        os.environ.setdefault('XDG_CACHE_HOME', default_cache_root)
-        for env_key in ['HF_HOME', 'TRANSFORMERS_CACHE', 'SENTENCE_TRANSFORMERS_HOME', 'XDG_CACHE_HOME']:
+        for env_key in [
+            'HF_HOME', 'TRANSFORMERS_CACHE',
+            'SENTENCE_TRANSFORMERS_HOME', 'XDG_CACHE_HOME'
+        ]:
+            os.environ.setdefault(env_key, os.path.join(default_cache_root, env_key.lower()))
             os.makedirs(os.environ[env_key], exist_ok=True)
 
-        # Disable Chroma
+        # Disable Chroma telemetry & init client
         client = chromadb.PersistentClient(
             path=chroma_path,
             settings=Settings(anonymized_telemetry=False)
         )
 
         # Ensure collection exists
-        # Use explicit embedding function to ensure queries can compute embeddings
         embedding_function = embedding_functions.DefaultEmbeddingFunction()
         try:
             self.vector_collection = client.get_collection(
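A note on the new cache loop: `os.environ.setdefault` only writes a variable when it is not already set, so values injected by the Space's configuration always win. Below is a minimal standalone sketch of the layout the loop produces, assuming `CACHE_ROOT` is unset so the `/data/cache` default applies; as a side note, recent `transformers` releases treat `TRANSFORMERS_CACHE` as deprecated in favour of `HF_HOME`, so setting both, as this commit does, is a harmless belt-and-braces choice.

    import os

    # Same loop as the commit, run in isolation; with CACHE_ROOT unset,
    # default_cache_root falls back to '/data/cache'.
    default_cache_root = os.getenv('CACHE_ROOT', '/data/cache')
    for env_key in ['HF_HOME', 'TRANSFORMERS_CACHE',
                    'SENTENCE_TRANSFORMERS_HOME', 'XDG_CACHE_HOME']:
        os.environ.setdefault(env_key, os.path.join(default_cache_root, env_key.lower()))

    print(os.environ['HF_HOME'])                     # /data/cache/hf_home (unless already set)
    print(os.environ['SENTENCE_TRANSFORMERS_HOME'])  # /data/cache/sentence_transformers_home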
@@ -56,57 +53,40 @@ class ProcessAwareRAG:
                 "legal_context",
                 embedding_function=embedding_function
             )
 
-        #
+        # Init LLM
         genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
         self.llm = genai.GenerativeModel('gemini-2.5-flash-lite')
 
     def retrieve_graph_context(self, classification: Dict) -> Dict:
-        """Retrieve relevant context from knowledge graph"""
         graph_context = {
             'current_step': None,
             'next_steps': [],
             'resources': [],
             'process_overview': None
         }
         if classification['process'] == 'general':
             return graph_context
 
         process_name = classification['process_name']
-        graph_context['current_step'] = {
-            'id': current_step_id,
-            **self.legal_graph.graph.nodes[current_step_id]
-        }
-
-        # Get next steps
-        graph_context['next_steps'] = self.legal_graph.get_next_steps(current_step_id)
-
-        # Get relevant resources
-        graph_context['resources'] = self.legal_graph.get_required_resources(current_step_id)
+        current_step_id = classification['step'] or self.legal_graph.find_process_start(process_name)
+
+        if current_step_id and current_step_id in self.legal_graph.graph.nodes:
+            graph_context['current_step'] = {
+                'id': current_step_id,
+                **self.legal_graph.graph.nodes[current_step_id]
+            }
+            graph_context['next_steps'] = self.legal_graph.get_next_steps(current_step_id)
+            graph_context['resources'] = self.legal_graph.get_required_resources(current_step_id)
 
         return graph_context
 
     def retrieve_vector_context(self, user_query: str, classification: Dict) -> List[Dict]:
-        """Retrieve relevant context from vector store"""
-
-        # Query vector store
         results = self.vector_collection.query(
             query_texts=[user_query],
             n_results=3,
             where={"process": classification.get('process_name', '')} if classification.get('process_name') else None
         )
 
         vector_context = []
         if results['documents']:
             for i in range(len(results['documents'][0])):
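The reworked `retrieve_graph_context` leans on `LegalProcessGraph.find_process_start`, which is defined outside this diff. For readers following along, a plausible shape for it, assuming the wrapped graph is a `networkx.DiGraph` (the diff's use of `self.legal_graph.graph.nodes` suggests this) whose nodes carry a `process` attribute, might look like the hypothetical sketch below; the real helper may differ.

    from typing import Optional
    import networkx as nx

    def find_process_start(graph: nx.DiGraph, process_name: str) -> Optional[str]:
        """Hypothetical helper: return the step of `process_name` that has no
        predecessor within the same process (i.e. the process entry point)."""
        steps = [n for n, d in graph.nodes(data=True) if d.get('process') == process_name]
        for node in steps:
            in_process_preds = [p for p in graph.predecessors(node)
                                if graph.nodes[p].get('process') == process_name]
            if not in_process_preds:
                return node
        return steps[0] if steps else None

    if __name__ == "__main__":
        g = nx.DiGraph()
        g.add_node("aid_check", process="legal_aid", title="Check eligibility")
        g.add_node("aid_apply", process="legal_aid", title="Apply at DLSA")
        g.add_edge("aid_check", "aid_apply")
        print(find_process_start(g, "legal_aid"))  # -> aid_check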
@@ -115,109 +95,74 @@ class ProcessAwareRAG:
                     'metadata': results['metadatas'][0][i],
                     'distance': results['distances'][0][i] if 'distances' in results else None
                 })
 
         return vector_context
 
-    def generate_response(self, user_query: str, graph_context: Dict,
+    def generate_response(self, user_query: str, graph_context: Dict,
+                          vector_context: List[Dict], classification: Dict) -> str:
+        """
+        Generate Indian Law Assistant responses:
+        - Normal legal Q&A: Short, clear answers
+        - Complex / case-specific: Say consult professional
+        - If vector data is wrong: Override with correct info
+        """
 
         system_prompt = """
-        - And Must be Short and Clear Not To give Like long Long Para answers Answer must be Short and Clear
-        - Dont Give Large Answer Give Answer In One or two three Lines answers Give Accordingly
+        You are an **Indian Law Assistant**.
+
+        RULES:
+        - For general questions on Indian law (IPC, CrPC, FIR, consumer law, bail, etc.), answer briefly (1–3 lines).
+        - Use **sections of Indian laws (IPC, CrPC, Evidence Act, etc.)** where relevant.
+        - If query is very **complex / case-dependent**, politely suggest consulting a qualified lawyer, but still share general process info.
+        - If retrieved data looks irrelevant or wrong, override it with correct general legal knowledge of Indian laws.
+        - Be empathetic, clear, and concise.
+        - Use the same language as the user (Hindi/English mix if needed).
+        - Avoid long paragraphs. Use short, crisp answers with bullet points where possible.
         """
 
-        #
+        # Build context
         context_sections = []
 
-        # Add graph context
         if graph_context['current_step']:
             context_sections.append(f"""
-            Description: {graph_context['current_step']['description']}
-            Properties: {graph_context['current_step'].get('properties', {})}
+            CURRENT STEP:
+            {graph_context['current_step']['title']} - {graph_context['current_step']['description']}
             """)
 
         if graph_context['next_steps']:
-            context_sections.append(f"""
-            NEXT STEPS:
-            {next_steps_text}
-            """)
+            context_sections.append("NEXT STEPS:\n" + "\n".join([
+                f"- {s['title']}: {s['description']}" for s in graph_context['next_steps']
+            ]))
 
         if graph_context['resources']:
-            context_sections.append(f"""
-            RELEVANT RESOURCES:
-            {resources_text}
-            """)
+            context_sections.append("RESOURCES:\n" + "\n".join([
+                f"- {r['title']} ({r['type']}): {r['properties'].get('url', r['properties'].get('phone', 'Contact available'))}"
+                for r in graph_context['resources']
+            ]))
 
-        # Add vector context
         if vector_context:
             vector_text = "\n\n".join([doc['content'] for doc in vector_context])
-            context_sections.append(f"""
-            {vector_text}
-            """)
+            context_sections.append("REFERENCE CONTEXT:\n" + vector_text)
 
-        # Build final prompt
         full_prompt = f"""
         {system_prompt}
 
         USER QUERY: "{user_query}"
+        CLASSIFICATION: Process: {classification.get('process_name', 'General')} | Intent: {classification.get('intent', 'information')}
 
+        CONTEXT:
         {chr(10).join(context_sections)}
 
-        1. Acknowledges the user's situation empathetically
-        2. Provides specific next steps if this is a process guidance request
-        3. Includes relevant official links and contact information
-        4. Reminds the user to consult a lawyer for specific legal advice
-        5. Uses bullet points, bold formatting, and clear structure
-
-        Format your response with clear sections and actionable information.
+        Generate the best possible short, accurate, and user-friendly response.
         """
 
         try:
             response = self.llm.generate_content(full_prompt)
             return response.text
         except Exception as e:
+            return f"⚠️ System error. Please try again later or contact NALSA (nalsa-dla@nic.in). Error: {str(e)}"
 
     def process_query(self, user_query: str) -> Dict[str, Any]:
-        """Main pipeline: process user query end-to-end"""
-
-        # Step 1: Classify query
         classification = self.classifier.classify_query(user_query)
-
-        # Step 2: Retrieve graph context
         graph_context = self.retrieve_graph_context(classification)
-
-        # Step 3: Retrieve vector context
         vector_context = self.retrieve_vector_context(user_query, classification)
-
-        # Step 4: Generate response
         response = self.generate_response(user_query, graph_context, vector_context, classification)
 
         return {
             'response': response,
             'classification': classification,
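One detail worth calling out in the prompt assembly above: `chr(10)` is simply the newline character. It is spelled that way because, before Python 3.12, f-string expressions may not contain backslashes, so writing `"\n".join(context_sections)` directly inside the `full_prompt` f-string would raise a SyntaxError. A quick demonstration:

    sections = ["CURRENT STEP:\n...", "NEXT STEPS:\n- ..."]

    # chr(10) == "\n"; the chr() spelling sidesteps the pre-3.12 ban on
    # backslashes inside f-string expressions.
    joined = f"CONTEXT:\n{chr(10).join(sections)}"
    assert joined == "CONTEXT:\n" + "\n".join(sections)
    print(joined)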
@@ -230,19 +175,15 @@ class ProcessAwareRAG:
             }
         }
 
 if __name__ == "__main__":
     rag_system = ProcessAwareRAG()
-    test_query = "I need free legal help but I'm not sure if I qualify. My monthly income is around 45000 rupees."
+    test_query = "Under which IPC section is cheating punishable in India?"
     result = rag_system.process_query(test_query)
 
     print("=== QUERY ===")
     print(test_query)
     print("\n=== RESPONSE ===")
     print(result['response'])
     print("\n=== DEBUG INFO ===")
-    print(f"Graph nodes: {result['debug_info']['graph_nodes_found']}")
-    print(f"Vector docs: {result['debug_info']['vector_docs_found']}")
+    print(result['debug_info'])
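Finally, for anyone exercising the Space after this commit, a minimal smoke test of the public surface. This sketch assumes `GOOGLE_API_KEY` is set, `legal_processes.pkl` is present, and the classifier wired up earlier in `__init__` (outside this diff) initializes cleanly.

    from process_aware_rag import ProcessAwareRAG

    rag = ProcessAwareRAG()
    result = rag.process_query("How do I apply for free legal aid?")

    # Keys per the return dict in this diff.
    print(result['response'])        # LLM answer text (or the fallback error string)
    print(result['classification'])  # classifier output used for routing
    print(result['debug_info'])      # retrieval stats, as printed by __main__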