Spaces:

Zeggai
/

AgenticRAG

Sleeping

App Files Community

Zeggai Abdellah committed on Jun 4, 2025

Commit

1817834

1 Parent(s): 8355f0c

add sequential numbering for the citations

Browse files

Files changed (1) hide show

rag_pipeline.py +99 -42

rag_pipeline.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-RAG Pipeline for vaccine assistant
-Handles agent creation and question answering
 """
 import json
@@ -46,13 +46,13 @@ def extract_source_ids(response_text):
         ids = [id_str.strip() for id_str in citation.split(',')]
         all_ids.extend(ids)
-    # Get unique source IDs
-    source_ids = list(set(all_ids))
-    # Filter out any non-UUID-like IDs (if needed)
-    # This is now optional as we're handling various source ID formats
-    # uuid_pattern = r'^[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}$'
-    # source_ids = [source_id for source_id in source_ids if re.match(uuid_pattern, source_id, re.IGNORECASE)]
     if not source_ids:
         print("Warning: No valid source IDs found after filtering.")
@@ -61,6 +61,41 @@ def extract_source_ids(response_text):
     return source_ids
 def create_custom_prompt():
     """Create custom prompt with medical assistant instructions"""
@@ -240,9 +275,9 @@ def process_question(agent, question: str) -> str:
         print(f"Error processing question: {e}")
         return f"Error processing your question: {str(e)}"
-def process_question_with_citations(agent, question: str, chunks_directory="./data/") -> dict:
     """
-    Process a question through the RAG pipeline and extract cited elements.
     Args:
         agent: The initialized RAG agent
@@ -251,9 +286,10 @@ def process_question_with_citations(agent, question: str, chunks_directory="./da
     Returns:
         dict: {
-            "response": str,
-            "cited_elements_json": str,
-            "unique_ids": list
         }
     """
     try:
@@ -261,48 +297,69 @@ def process_question_with_citations(agent, question: str, chunks_directory="./da
         response = agent.chat(question)
         response_text = response.response
-        # Extract source IDs from the response
         unique_ids = extract_source_ids(response_text)
         # Load all chunks data to find cited elements
         all_chunks_data = []
-        # the ids is only in the two main files, so we can load them all at once
-        min_chunks_files = ["Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.json", "Immunization in Practice_WHO_eng_2015.json"]
         for json_file in min_chunks_files:
-            if json_file.endswith('.json'):
-                json_path = os.path.join(chunks_directory, json_file)
-                try:
-                    with open(json_path, "r", encoding="utf-8") as f:
-                        chunks_data = json.load(f)
-                        all_chunks_data.extend(chunks_data)
-                except Exception as e:
-                    print(f"Warning: Could not load {json_file}: {e}")
-        # Get only the cited elements
-        cited_elements = []
-        for element in all_chunks_data:
-            if element.get("type") =='TableElement':
-               if element.get("element_id") in unique_ids:
-                 cited_elements.append(element['elements'])
-            else :
-             if "elements" in element:
-                for nested_element in element["elements"]:
-                    if nested_element.get("element_id") in unique_ids:
-                        cited_elements.append(nested_element)
         # Convert to JSON
-        cited_elements_json = json.dumps(cited_elements, ensure_ascii=False, indent=2)
         return {
-            "response": response_text,
             "cited_elements_json": cited_elements_json,
-            "unique_ids": unique_ids
         }
     except Exception as e:
         print(f"Error processing question: {e}")
         return {
-            "response": response_text,
             "cited_elements_json": "[]",
-            "unique_ids": []
-        }

 # -*- coding: utf-8 -*-
 """
+Enhanced RAG Pipeline for vaccine assistant
+Handles agent creation and question answering with sequential citation numbering
 """
 import json
         ids = [id_str.strip() for id_str in citation.split(',')]
         all_ids.extend(ids)
+    # Get unique source IDs while preserving order
+    seen = set()
+    source_ids = []
+    for id_str in all_ids:
+        if id_str not in seen:
+            seen.add(id_str)
+            source_ids.append(id_str)
     if not source_ids:
         print("Warning: No valid source IDs found after filtering.")
     return source_ids
+def convert_citations_to_sequential(response_text, source_id_to_number_map):
+    """
+    Convert source IDs in response text to sequential numbers.
+    Args:
+        response_text (str): The response text with source ID citations
+        source_id_to_number_map (dict): Mapping from source IDs to sequential numbers
+    Returns:
+        str: Response text with sequential number citations
+    """
+    def replace_citation(match):
+        citation_content = match.group(1)
+        # Handle multiple IDs in one citation (comma-separated)
+        ids = [id_str.strip() for id_str in citation_content.split(',')]
+        # Convert each ID to its sequential number
+        numbers = []
+        for id_str in ids:
+            if id_str in source_id_to_number_map:
+                numbers.append(str(source_id_to_number_map[id_str]))
+        # Return the formatted citation with sequential numbers
+        if len(numbers) == 1:
+            return f"[{numbers[0]}]"
+        elif len(numbers) > 1:
+            return f"[{','.join(numbers)}]"
+        else:
+            return match.group(0)  # Return original if no mapping found
+    # Replace all citations in the text
+    sequential_response = re.sub(r'\[([^\[\]]+)\]', replace_citation, response_text)
+    return sequential_response
 def create_custom_prompt():
     """Create custom prompt with medical assistant instructions"""
         print(f"Error processing question: {e}")
         return f"Error processing your question: {str(e)}"
+def process_question_with_sequential_citations(agent, question: str, chunks_directory="./data/") -> dict:
     """
+    Process a question through the RAG pipeline and return response with sequential citation numbers.
     Args:
         agent: The initialized RAG agent
     Returns:
         dict: {
+            "response": str,  # Response with sequential citation numbers [1], [2], etc.
+            "cited_elements_json": str,  # JSON array of cited elements in order
+            "unique_ids": list,  # Original source IDs in order
+            "citation_mapping": dict  # Mapping from source ID to citation number
         }
     """
     try:
         response = agent.chat(question)
         response_text = response.response
+        # Extract source IDs from the response (preserving order)
         unique_ids = extract_source_ids(response_text)
+        # Create mapping from source ID to sequential number
+        source_id_to_number = {source_id: i + 1 for i, source_id in enumerate(unique_ids)}
+        # Convert citations to sequential numbers
+        sequential_response = convert_citations_to_sequential(response_text, source_id_to_number)
         # Load all chunks data to find cited elements
         all_chunks_data = []
+        min_chunks_files = ["Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.json",
+                           "Immunization_in_Practice_WHO_eng_2015.json"]
         for json_file in min_chunks_files:
+            json_path = os.path.join(chunks_directory, json_file)
+            try:
+                with open(json_path, "r", encoding="utf-8") as f:
+                    chunks_data = json.load(f)
+                    all_chunks_data.extend(chunks_data)
+            except Exception as e:
+                print(f"Warning: Could not load {json_file}: {e}")
+        # Get cited elements in the same order as the sequential citations
+        cited_elements_ordered = []
+        for source_id in unique_ids:  # This preserves the order
+            for element in all_chunks_data:
+                if element.get("type") == 'TableElement':
+                    if element.get("element_id") == source_id:
+                        cited_elements_ordered.append(element)
+                        break
+                else:
+                    if "elements" in element:
+                        for nested_element in element["elements"]:
+                            if nested_element.get("element_id") == source_id:
+                                cited_elements_ordered.append(nested_element)
+                                break
+                        else:
+                            continue
+                        break
         # Convert to JSON
+        cited_elements_json = json.dumps(cited_elements_ordered, ensure_ascii=False, indent=2)
         return {
+            "response": sequential_response,
             "cited_elements_json": cited_elements_json,
+            "unique_ids": unique_ids,
+            "citation_mapping": source_id_to_number
         }
     except Exception as e:
         print(f"Error processing question: {e}")
         return {
+            "response": response_text if 'response_text' in locals() else "Error occurred",
             "cited_elements_json": "[]",
+            "unique_ids": [],
+            "citation_mapping": {}
+        }
+def process_question_with_citations(agent, question: str, chunks_directory="./data/") -> dict:
+    """
+    Legacy function - maintained for backward compatibility.
+    Now calls the new sequential citation function.
+    """
+    return process_question_with_sequential_citations(agent, question, chunks_directory)