jfang commited on
Commit
3718631
·
verified ·
1 Parent(s): 8a7f087

Upload 7 files

Browse files
app.py CHANGED
@@ -3,7 +3,13 @@ import torch
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import spaces
5
  import re
6
- from typing import List, Dict, Tuple
 
 
 
 
 
 
7
 
8
 
9
  # Initialize model and tokenizer
@@ -21,6 +27,51 @@ model = AutoModelForCausalLM.from_pretrained(
21
  trust_remote_code=True
22
  )
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  @spaces.GPU(duration=60)
26
  def generate_response_stream(
@@ -191,6 +242,84 @@ def generate_response_stream(
191
  thread.join()
192
 
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  def respond(
195
  message: str,
196
  history: List[Dict[str, str]],
@@ -200,39 +329,186 @@ def respond(
200
  top_p: float,
201
  ):
202
  """
203
- Response function for custom Gradio interface with separate thinking display.
204
  """
205
- thinking_content = ""
206
- response_content = ""
 
 
207
 
208
  try:
209
- # Stream tokens from the model
 
 
 
 
 
 
 
 
 
210
  for thinking, response in generate_response_stream(
211
  message=message,
212
  history=history,
213
- system_message=system_message,
214
  max_tokens=max_tokens,
215
  temperature=temperature,
216
  top_p=top_p,
217
  ):
218
- thinking_content = thinking
219
- response_content = response
220
- # Yield both thinking and response content
221
- yield thinking_content, response_content
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  except Exception as e:
224
  error_message = f"❌ Error generating response: {str(e)}"
225
- yield "", error_message
226
 
227
 
228
  # Default system prompt for gprMax assistance
229
- DEFAULT_SYSTEM_PROMPT = """You are a helpful assistant specialized in gprMax, an open-source software that simulates electromagnetic wave propagation. You help users with:
 
 
 
230
  1. Creating gprMax input files (.in files)
231
  2. Understanding gprMax commands and syntax
232
  3. Setting up simulations for GPR (Ground Penetrating Radar) and other EM applications
233
  4. Troubleshooting simulation issues
234
  5. Optimizing simulation parameters
235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  If you give code blocks, ensure to enclose them inside ```.
237
 
238
  There is no need to always give full input codes, be sure to understand what user needs and intends to do. Some times a simple line of code can do, sometimes user wants explanation rather than codes.
@@ -297,13 +573,21 @@ with gr.Blocks(title="gprMax Support", theme=gr.themes.Ocean()) as demo:
297
  thinking_display = gr.Markdown(
298
  value="*Thinking process will appear here when the AI is reasoning through your question...*",
299
  label="Thinking",
300
- height=400,
 
 
 
 
 
 
 
 
301
  )
302
 
303
  # Settings
304
  with gr.Accordion("⚙️ Settings", open=True):
305
  system_message = gr.Textbox(
306
- value=DEFAULT_SYSTEM_PROMPT,
307
  label="System Message",
308
  lines=5,
309
  info="Customize the assistant's behavior"
@@ -345,7 +629,7 @@ with gr.Blocks(title="gprMax Support", theme=gr.themes.Ocean()) as demo:
345
 
346
  def bot_respond(history, system_msg, max_tok, temp, top_p_val):
347
  if not history or history[-1]["role"] != "user":
348
- yield history, "*No thinking process*"
349
  return
350
 
351
  user_message = history[-1]["content"]
@@ -355,10 +639,12 @@ with gr.Blocks(title="gprMax Support", theme=gr.themes.Ocean()) as demo:
355
  history = history + [{"role": "assistant", "content": ""}]
356
 
357
  thinking_text = ""
 
358
  is_thinking = False
359
  has_main_content = False
 
360
 
361
- for thinking, response in respond(
362
  user_message,
363
  history_for_model,
364
  system_msg,
@@ -368,46 +654,76 @@ with gr.Blocks(title="gprMax Support", theme=gr.themes.Ocean()) as demo:
368
  ):
369
  # Update thinking display
370
  if thinking:
371
- thinking_text = f"## Reasoning Process\n\n{thinking}"
372
- is_thinking = True
373
- else:
 
 
 
 
 
 
 
374
  thinking_text = "*Waiting for response...*"
375
 
 
 
 
 
376
  # Update chat response
377
  if response and response.strip():
378
  # We have actual response content
379
- history[-1]["content"] = response
380
- has_main_content = True
 
 
 
 
 
381
  elif is_thinking and not has_main_content:
382
  # Still thinking, no main response yet
383
  history[-1]["content"] = "🤔 *AI is thinking... Check the right pane for thinking details*"
 
 
384
  elif not response:
385
  # No response yet and no thinking detected
386
  history[-1]["content"] = "⏳ *Generating response...*"
387
 
388
- yield history, thinking_text
389
 
390
  # Event handlers
391
  msg.submit(user_submit, [msg, chatbot], [msg, chatbot]).then(
392
  bot_respond,
393
  [chatbot, system_message, max_tokens, temperature, top_p],
394
- [chatbot, thinking_display]
395
  )
396
 
397
  submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot]).then(
398
  bot_respond,
399
  [chatbot, system_message, max_tokens, temperature, top_p],
400
- [chatbot, thinking_display]
 
 
 
 
 
 
 
 
 
401
  )
402
 
403
- clear_btn.click(lambda: ([], "*Thinking process will appear here when the AI is reasoning through your question...*"), outputs=[chatbot, thinking_display])
 
404
 
405
  gr.Markdown(
406
- """
407
  ---
408
  ### About
409
  This assistant uses `jfang/gprmax-ft-Qwen3-4B-Instruct`, a model fine-tuned specifically for gprMax support.
410
 
 
 
411
  **Note**: For best results, be specific about your gprMax version and simulation requirements.
412
  """
413
  )
 
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import spaces
5
  import re
6
+ from typing import List, Dict, Tuple, Optional
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ # Add rag-db to path for imports
11
+ sys.path.append(str(Path(__file__).parent / "rag-db"))
12
+ from retriever import create_retriever, GprMaxRAGRetriever
13
 
14
 
15
  # Initialize model and tokenizer
 
27
  trust_remote_code=True
28
  )
29
 
30
+ # Initialize RAG retriever
31
+ RAG_DB_PATH = Path(__file__).parent / "rag-db" / "chroma_db"
32
+ retriever: Optional[GprMaxRAGRetriever] = None
33
+
34
+ def generate_database_if_needed():
35
+ """Generate the RAG database if it doesn't exist"""
36
+ if not RAG_DB_PATH.exists():
37
+ print("=" * 60)
38
+ print("RAG database not found. Generating database...")
39
+ print("This is a one-time process and may take a few minutes.")
40
+ print("=" * 60)
41
+
42
+ import subprocess
43
+ try:
44
+ # Run the generation script
45
+ result = subprocess.run(
46
+ ["python", str(Path(__file__).parent / "rag-db" / "generate_db.py")],
47
+ capture_output=True,
48
+ text=True,
49
+ check=True
50
+ )
51
+ print(result.stdout)
52
+ print("✅ Database generated successfully!")
53
+ return True
54
+ except subprocess.CalledProcessError as e:
55
+ print(f"❌ Failed to generate database: {e}")
56
+ if e.stderr:
57
+ print(f"Error output: {e.stderr}")
58
+ return False
59
+ return True
60
+
61
+ # Generate database if needed and load retriever
62
+ if generate_database_if_needed():
63
+ try:
64
+ print(f"Loading RAG database from {RAG_DB_PATH}")
65
+ retriever = create_retriever(db_path=RAG_DB_PATH)
66
+ print("RAG database loaded successfully")
67
+ except Exception as e:
68
+ print(f"Error loading RAG database: {e}")
69
+ print("RAG features will be disabled.")
70
+ retriever = None
71
+ else:
72
+ print("RAG features will be disabled due to database generation failure.")
73
+ retriever = None
74
+
75
 
76
  @spaces.GPU(duration=60)
77
  def generate_response_stream(
 
242
  thread.join()
243
 
244
 
245
+ # Tool definitions in Qwen3 format
246
+ TOOLS = [
247
+ {
248
+ "type": "function",
249
+ "function": {
250
+ "name": "search_documentation",
251
+ "description": "Search gprMax documentation for relevant information about commands, syntax, parameters, or usage",
252
+ "parameters": {
253
+ "type": "object",
254
+ "properties": {
255
+ "query": {
256
+ "type": "string",
257
+ "description": "The search query to find relevant documentation"
258
+ },
259
+ "num_results": {
260
+ "type": "integer",
261
+ "description": "Number of results to return",
262
+ "default": 10
263
+ }
264
+ },
265
+ "required": ["query"]
266
+ }
267
+ }
268
+ }
269
+ ]
270
+
271
+ def format_tools_prompt() -> str:
272
+ """Format tools for inclusion in system prompt"""
273
+ import json
274
+ return json.dumps(TOOLS, indent=2)
275
+
276
+
277
+ def perform_rag_search(query: str, k: int = 10) -> Tuple[str, List[Dict]]:
278
+ """
279
+ Perform RAG search and return formatted context and sources
280
+
281
+ Returns:
282
+ Tuple of (context_for_llm, source_list_for_display)
283
+ """
284
+ if not retriever:
285
+ print(f"[DEBUG] Retriever is None!")
286
+ return "", []
287
+
288
+ try:
289
+ print(f"[DEBUG] Searching for: '{query}' with k={k}")
290
+ # Search for relevant documents
291
+ results = retriever.search(query, k=k)
292
+
293
+ print(f"[DEBUG] Search returned {len(results) if results else 0} results")
294
+ if not results:
295
+ return "", []
296
+
297
+ # Format context for LLM - pass all text content
298
+ context_parts = []
299
+ source_list = []
300
+
301
+ for i, result in enumerate(results, 1):
302
+ # Add full text to context for LLM (up to 1000 chars per doc)
303
+ context_parts.append(f"[Document {i}]: {result.text}")
304
+
305
+ # Add to source list for display (limited preview)
306
+ source_list.append({
307
+ "index": i,
308
+ "source": result.metadata.get("source", "Unknown"),
309
+ "score": result.score,
310
+ "preview": result.text[:150] + "..." if len(result.text) > 150 else result.text
311
+ })
312
+
313
+ context = "\n\n".join(context_parts)
314
+ return context, source_list
315
+
316
+ except Exception as e:
317
+ print(f"[DEBUG] RAG search error: {e}")
318
+ import traceback
319
+ traceback.print_exc()
320
+ return "", []
321
+
322
+
323
  def respond(
324
  message: str,
325
  history: List[Dict[str, str]],
 
329
  top_p: float,
330
  ):
331
  """
332
+ Response function with proper Qwen3 tool calling
333
  """
334
+ import json
335
+ import re
336
+
337
+ sources_content = ""
338
 
339
  try:
340
+ # Use system message as-is (already has tools included)
341
+ system_with_tools = system_message
342
+
343
+ # First, get initial response from model to see if it wants to use tools
344
+ tool_call = None
345
+ accumulated_response = ""
346
+ final_thinking = ""
347
+ is_complete = False
348
+
349
+ # Collect the full response (thinking + potential tool call)
350
  for thinking, response in generate_response_stream(
351
  message=message,
352
  history=history,
353
+ system_message=system_with_tools,
354
  max_tokens=max_tokens,
355
  temperature=temperature,
356
  top_p=top_p,
357
  ):
358
+ final_thinking = thinking if thinking else final_thinking
359
+ accumulated_response = response
 
 
360
 
361
+ # Show thinking progress only
362
+ if thinking:
363
+ yield thinking, "⏳ *AI is analyzing your request...*", sources_content
364
+
365
+ # After streaming completes, check what we got
366
+ if accumulated_response and accumulated_response.strip():
367
+ # Check if the complete response is a JSON tool call
368
+ if accumulated_response.strip().startswith('{'):
369
+ try:
370
+ # Try to parse the entire response as JSON
371
+ response_json = json.loads(accumulated_response.strip())
372
+ if "tool_call" in response_json or ("thought" in response_json and "tool_call" in response_json):
373
+ tool_call = response_json.get("tool_call") or response_json["tool_call"]
374
+ # Show status that we're processing the tool call
375
+ yield final_thinking, "🔍 *Processing documentation search request...*", sources_content
376
+ is_complete = True
377
+ except json.JSONDecodeError:
378
+ # Invalid JSON, treat as normal response
379
+ yield final_thinking, accumulated_response, sources_content
380
+ is_complete = True
381
+ except Exception:
382
+ yield final_thinking, accumulated_response, sources_content
383
+ is_complete = True
384
+ else:
385
+ # It's a normal text response, not a tool call
386
+ yield final_thinking, accumulated_response, sources_content
387
+ is_complete = True
388
+
389
+ # If tool was called, execute it
390
+ if tool_call and retriever:
391
+ tool_name = tool_call.get("name")
392
+ print(f"[DEBUG] Tool called: {tool_name}")
393
+ print(f"[DEBUG] Tool call details: {tool_call}")
394
+
395
+ if tool_name == "search_documentation":
396
+ # Update status
397
+ yield "🔍 *Searching documentation...*", "⏳ *Preparing to search...*", "📚 *Retrieving relevant documents...*"
398
+
399
+ # Get search query
400
+ query = tool_call.get("arguments", {}).get("query", message)
401
+ num_results = tool_call.get("arguments", {}).get("num_results", 10)
402
+ print(f"[DEBUG] Query extracted: '{query}', num_results: {num_results}")
403
+
404
+ # Perform search
405
+ context, sources_list = perform_rag_search(query, k=num_results)
406
+ print(f"[DEBUG] Search results - Context length: {len(context)}, Sources: {len(sources_list)}")
407
+
408
+ if context:
409
+ # Format sources for display
410
+ if sources_list:
411
+ sources_parts = ["## 📚 Documentation Sources\n"]
412
+ for source in sources_list:
413
+ sources_parts.append(
414
+ f"**[{source['index']}] {source['source']}** (Score: {source['score']:.3f})\n"
415
+ f"```\n{source['preview']}\n```\n"
416
+ )
417
+ sources_content = "\n".join(sources_parts)
418
+ else:
419
+ sources_content = "*No relevant documentation found*"
420
+
421
+ yield "✅ *Documentation retrieved*", "⏳ *Generating response with context...*", sources_content
422
+
423
+ # Now generate response with the retrieved context
424
+ augmented_message = f"""Tool call result for search_documentation:
425
+
426
+ {context}
427
+
428
+ Original question: {message}
429
+
430
+ Please provide a comprehensive answer based on the documentation above."""
431
+
432
+ # Generate final response with context
433
+ for thinking, response in generate_response_stream(
434
+ message=augmented_message,
435
+ history=history,
436
+ system_message=system_message, # Use original system message for final response
437
+ max_tokens=max_tokens,
438
+ temperature=temperature,
439
+ top_p=top_p,
440
+ ):
441
+ yield thinking, response, sources_content
442
+ else:
443
+ sources_content = "*No relevant documentation found*"
444
+ yield final_thinking, "⚠️ *Unable to retrieve documentation. Providing general answer...*", sources_content
445
+
446
+ # Generate response without documentation context
447
+ fallback_message = f"""The user asked about: {message}
448
+
449
+ No relevant documentation was found in the database. Please provide a helpful answer based on your general knowledge of gprMax."""
450
+
451
+ for thinking, response in generate_response_stream(
452
+ message=fallback_message,
453
+ history=history,
454
+ system_message=system_message,
455
+ max_tokens=max_tokens,
456
+ temperature=temperature,
457
+ top_p=top_p,
458
+ ):
459
+ yield thinking, response, sources_content
460
+ # If tool was called but retriever is not available
461
+ elif tool_call and not retriever:
462
+ yield final_thinking, "⚠️ *Documentation search is not available. Providing answer based on general knowledge...*", ""
463
+
464
+ # Generate response without RAG
465
+ for thinking, response in generate_response_stream(
466
+ message=message,
467
+ history=history,
468
+ system_message=system_message,
469
+ max_tokens=max_tokens,
470
+ temperature=temperature,
471
+ top_p=top_p,
472
+ ):
473
+ yield thinking, response, ""
474
+ # If no tool call and response wasn't already yielded
475
+ elif not tool_call and not is_complete:
476
+ # This shouldn't happen but handle it just in case
477
+ if accumulated_response and not accumulated_response.strip().startswith('{'):
478
+ yield final_thinking, accumulated_response, sources_content
479
+
480
  except Exception as e:
481
  error_message = f"❌ Error generating response: {str(e)}"
482
+ yield "", error_message, ""
483
 
484
 
485
  # Default system prompt for gprMax assistance
486
+ def get_default_system_prompt():
487
+ """Get system prompt with tools formatted"""
488
+ tools_json = format_tools_prompt()
489
+ return f"""You are a helpful assistant specialized in gprMax, an open-source software that simulates electromagnetic wave propagation. You help users with:
490
  1. Creating gprMax input files (.in files)
491
  2. Understanding gprMax commands and syntax
492
  3. Setting up simulations for GPR (Ground Penetrating Radar) and other EM applications
493
  4. Troubleshooting simulation issues
494
  5. Optimizing simulation parameters
495
 
496
+ You have access to the following tools:
497
+ {tools_json}
498
+
499
+ When you need to search documentation, respond with a tool call in this JSON format:
500
+ {{
501
+ "thought": "I need to search the documentation for...",
502
+ "tool_call": {{
503
+ "name": "search_documentation",
504
+ "arguments": {{
505
+ "query": "your search query here"
506
+ }}
507
+ }}
508
+ }}
509
+
510
+ After receiving tool results, provide a comprehensive answer based on the documentation.
511
+
512
  If you give code blocks, ensure to enclose them inside ```.
513
 
514
  There is no need to always give full input codes, be sure to understand what user needs and intends to do. Some times a simple line of code can do, sometimes user wants explanation rather than codes.
 
573
  thinking_display = gr.Markdown(
574
  value="*Thinking process will appear here when the AI is reasoning through your question...*",
575
  label="Thinking",
576
+ height=300,
577
+ )
578
+
579
+ # Documentation sources in collapsible accordion
580
+ with gr.Accordion("📚 Documentation Sources", open=False) as sources_accordion:
581
+ sources_display = gr.Markdown(
582
+ value="*Documentation sources will appear here when RAG search is performed...*",
583
+ label="Sources",
584
+ height=300,
585
  )
586
 
587
  # Settings
588
  with gr.Accordion("⚙️ Settings", open=True):
589
  system_message = gr.Textbox(
590
+ value=get_default_system_prompt(),
591
  label="System Message",
592
  lines=5,
593
  info="Customize the assistant's behavior"
 
629
 
630
  def bot_respond(history, system_msg, max_tok, temp, top_p_val):
631
  if not history or history[-1]["role"] != "user":
632
+ yield history, "*No thinking process*", "*No sources*"
633
  return
634
 
635
  user_message = history[-1]["content"]
 
639
  history = history + [{"role": "assistant", "content": ""}]
640
 
641
  thinking_text = ""
642
+ sources_text = ""
643
  is_thinking = False
644
  has_main_content = False
645
+ is_searching = False
646
 
647
+ for thinking, response, sources in respond(
648
  user_message,
649
  history_for_model,
650
  system_msg,
 
654
  ):
655
  # Update thinking display
656
  if thinking:
657
+ if "Searching documentation" in thinking:
658
+ thinking_text = thinking
659
+ is_searching = True
660
+ elif "Documentation retrieved" in thinking:
661
+ thinking_text = thinking
662
+ is_searching = False
663
+ else:
664
+ thinking_text = f"## Reasoning Process\n\n{thinking}"
665
+ is_thinking = True
666
+ elif not thinking and not is_searching:
667
  thinking_text = "*Waiting for response...*"
668
 
669
+ # Update sources display
670
+ if sources:
671
+ sources_text = sources
672
+
673
  # Update chat response
674
  if response and response.strip():
675
  # We have actual response content
676
+ if "Preparing to search" in response or "Generating response" in response:
677
+ # Status messages
678
+ history[-1]["content"] = response
679
+ else:
680
+ # Actual content
681
+ history[-1]["content"] = response
682
+ has_main_content = True
683
  elif is_thinking and not has_main_content:
684
  # Still thinking, no main response yet
685
  history[-1]["content"] = "🤔 *AI is thinking... Check the right pane for thinking details*"
686
+ elif is_searching:
687
+ history[-1]["content"] = "🔍 *Searching documentation...*"
688
  elif not response:
689
  # No response yet and no thinking detected
690
  history[-1]["content"] = "⏳ *Generating response...*"
691
 
692
+ yield history, thinking_text, sources_text
693
 
694
  # Event handlers
695
  msg.submit(user_submit, [msg, chatbot], [msg, chatbot]).then(
696
  bot_respond,
697
  [chatbot, system_message, max_tokens, temperature, top_p],
698
+ [chatbot, thinking_display, sources_display]
699
  )
700
 
701
  submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot]).then(
702
  bot_respond,
703
  [chatbot, system_message, max_tokens, temperature, top_p],
704
+ [chatbot, thinking_display, sources_display]
705
+ )
706
+
707
+ clear_btn.click(
708
+ lambda: (
709
+ [],
710
+ "*Thinking process will appear here when the AI is reasoning through your question...*",
711
+ "*Documentation sources will appear here when RAG search is performed...*"
712
+ ),
713
+ outputs=[chatbot, thinking_display, sources_display]
714
  )
715
 
716
+ # RAG status indicator
717
+ rag_status = "✅ Documentation search enabled" if retriever else "⚠️ Documentation search disabled (run generate_db.py)"
718
 
719
  gr.Markdown(
720
+ f"""
721
  ---
722
  ### About
723
  This assistant uses `jfang/gprmax-ft-Qwen3-4B-Instruct`, a model fine-tuned specifically for gprMax support.
724
 
725
+ **RAG Status**: {rag_status}
726
+
727
  **Note**: For best results, be specific about your gprMax version and simulation requirements.
728
  """
729
  )
rag-db/README.md ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # gprMax RAG Database System
2
+
3
+ ## Overview
4
+ This is a production-ready Retrieval-Augmented Generation (RAG) system for gprMax documentation. It provides efficient vector search capabilities for the gprMax documentation, enabling intelligent context retrieval for the chatbot.
5
+
6
+ ## Architecture
7
+
8
+ ### Components
9
+ 1. **Document Processor**: Extracts and chunks documentation from gprMax GitHub repository
10
+ 2. **Embedding Model**: ChromaDB default embeddings (all-MiniLM-L6-v2); a Qwen embedding model may be adopted later
11
+ 3. **Vector Database**: ChromaDB with persistent storage
12
+ 4. **Retriever**: Search and context retrieval utilities
13
+
14
+ ### Key Features
15
+ - Automatic documentation extraction from gprMax GitHub repository
16
+ - Intelligent chunking with configurable size and overlap
17
+ - Persistent vector database using ChromaDB
18
+ - Efficient similarity search with score thresholding
19
+ - Metadata tracking for reproducibility
20
+
21
+ ## Installation
22
+
23
+ The database is **automatically generated** on first startup of the application. No manual installation required!
24
+
25
+ ## Automatic Generation
26
+
27
+ When the app starts:
28
+ 1. Checks if database exists at `rag-db/chroma_db/`
29
+ 2. If not found, automatically runs `generate_db.py`
30
+ 3. Clones gprMax repository and processes documentation
31
+ 4. Creates ChromaDB with default embeddings (all-MiniLM-L6-v2)
32
+ 5. Ready to use - this only happens once!
33
+
34
+ ## Manual Generation (Optional)
35
+
36
+ If you need to manually regenerate the database:
37
+
38
+ ```bash
39
+ cd rag-db
40
+ python generate_db.py --recreate
41
+ ```
42
+
43
+ Custom settings:
44
+ ```bash
45
+ python generate_db.py \
46
+ --db-path ./custom_db \
47
+ --temp-dir ./temp \
48
+ --device cuda \
49
+ --recreate
50
+ ```
51
+
52
+ ### 2. Use Retriever in Application
53
+
54
+ ```python
55
+ import sys
+ sys.path.append("rag-db")  # directory name contains a hyphen, so it is not importable as a package
+ from retriever import create_retriever
56
+
57
+ # Initialize retriever
58
+ retriever = create_retriever(db_path="./rag-db/chroma_db")
59
+
60
+ # Search for relevant documents
61
+ results = retriever.search("How to create a source?", k=5)
62
+
63
+ # Get formatted context for LLM
64
+ context = retriever.get_context("antenna patterns", k=3)
65
+
66
+ # Get relevant source files
67
+ files = retriever.get_relevant_files("boundary conditions")
68
+
69
+ # Get database statistics
70
+ stats = retriever.get_stats()
71
+ ```
72
+
73
+ ### 3. Test Retriever
74
+
75
+ ```bash
76
+ # Test with default query
77
+ python retriever.py
78
+
79
+ # Test with custom query
80
+ python retriever.py "How to model soil layers?"
81
+ ```
82
+
83
+ ## Database Schema
84
+
85
+ ### Document Structure
86
+ ```json
87
+ {
88
+ "id": "unique_hash",
89
+ "text": "document_chunk_text",
90
+ "metadata": {
91
+ "source": "docs/relative/path.rst",
92
+ "file_type": ".rst",
93
+ "chunk_index": 0,
94
+ "char_start": 0,
95
+ "char_end": 1000
96
+ }
97
+ }
98
+ ```
99
+
100
+ ### Metadata File
101
+ Generated `metadata.json` contains:
102
+ ```json
103
+ {
104
+ "created_at": "2024-01-01T00:00:00",
105
+ "embedding_model": "Qwen/Qwen2.5-0.5B",
106
+ "collection_name": "gprmax_docs_v1",
107
+ "chunk_size": 1000,
108
+ "chunk_overlap": 200,
109
+ "total_documents": 1234
110
+ }
111
+ ```
112
+
113
+ ## Configuration
114
+
115
+ ### Chunking Parameters
116
+ - `CHUNK_SIZE`: 1000 characters (optimal for context windows)
117
+ - `CHUNK_OVERLAP`: 200 characters (ensures continuity)
118
+
119
+ ### Embedding Model
120
+ - Current: ChromaDB default (`all-MiniLM-L6-v2`, 384-dim embeddings)
121
+ - Future: `Qwen/Qwen3-Embedding-0.6B` (when available)
122
+
123
+ ### Database Settings
124
+ - Storage: ChromaDB persistent client
125
+ - Collection: `gprmax_docs_v1` (versioned for updates)
126
+ - Distance Metric: Cosine similarity
127
+
128
+ ## Maintenance
129
+
130
+ ### Regular Updates
131
+ Run monthly or when gprMax documentation updates:
132
+ ```bash
133
+ # This will pull latest docs and update database
134
+ python generate_db.py
135
+ ```
136
+
137
+ ### Database Backup
138
+ ```bash
139
+ # Backup database
140
+ cp -r chroma_db chroma_db_backup_$(date +%Y%m%d)
141
+ ```
142
+
143
+ ### Performance Tuning
144
+ - Adjust `CHUNK_SIZE` and `CHUNK_OVERLAP` in `generate_db.py`
145
+ - Modify batch sizes for large datasets
146
+ - Use GPU acceleration with `--device cuda`
147
+
148
+ ## Integration with Main App
149
+
150
+ The RAG system integrates with the main Gradio app:
151
+
152
+ 1. Import retriever in `app.py`
153
+ 2. Use retriever to augment prompts with context
154
+ 3. Display source references in UI
155
+
156
+ Example integration:
157
+ ```python
158
+ # In app.py
159
+ import sys
+ sys.path.append("rag-db")  # hyphenated directory is not a valid package name
+ from retriever import create_retriever
160
+
161
+ retriever = create_retriever()
162
+
163
+ def augment_with_context(user_query):
164
+ context = retriever.get_context(user_query, k=3)
165
+ augmented_prompt = f"""
166
+ Context from documentation:
167
+ {context}
168
+
169
+ User question: {user_query}
170
+ """
171
+ return augmented_prompt
172
+ ```
173
+
174
+ ## Troubleshooting
175
+
176
+ ### Common Issues
177
+
178
+ 1. **Database not found**
179
+ - Run `python generate_db.py` first
180
+ - Check `--db-path` parameter
181
+
182
+ 2. **Out of memory**
183
+ - Use smaller batch sizes
184
+ - Use CPU instead of GPU
185
+ - Reduce chunk size
186
+
187
+ 3. **Slow generation**
188
+ - Use GPU with `--device cuda`
189
+ - Reduce repository depth with shallow clone
190
+ - Use pre-generated database
191
+
192
+ ### Logs
193
+ Check generation logs for detailed information:
194
+ ```bash
195
+ python generate_db.py 2>&1 | tee generation.log
196
+ ```
197
+
198
+ ## Future Enhancements
199
+
200
+ 1. **Model Upgrade**: Migrate to Qwen3-Embedding-0.6B when available
201
+ 2. **Incremental Updates**: Add documents without full regeneration
202
+ 3. **Multi-modal Support**: Include images and diagrams from docs
203
+ 4. **Query Expansion**: Automatic query reformulation for better retrieval
204
+ 5. **Caching Layer**: Redis cache for frequent queries
205
+ 6. **Fine-tuned Embeddings**: Domain-specific embedding model for gprMax
206
+
207
+ ## License
208
+ Same as parent project
rag-db/__init__.py ADDED
File without changes
rag-db/generate_db.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ RAG Database Generation Script for gprMax Documentation
4
+ Generates a ChromaDB vector database from gprMax documentation
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import shutil
10
+ import argparse
11
+ import logging
12
+ from pathlib import Path
13
+ from datetime import datetime
14
+ from typing import List, Dict, Any
15
+ import json
16
+ import hashlib
17
+
18
+ import chromadb
19
+ import git
20
+ from tqdm import tqdm
21
+
22
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class GprMaxDocumentProcessor:
27
+ """Process gprMax documentation files for vectorization"""
28
+
29
+ SUPPORTED_EXTENSIONS = {'.rst', '.md', '.txt'}
30
+ CHUNK_SIZE = 1000 # Characters per chunk
31
+ CHUNK_OVERLAP = 200 # Overlap between chunks
32
+
33
+ def __init__(self, repo_path: Path):
34
+ self.repo_path = repo_path
35
+ self.doc_path = repo_path / "docs"
36
+
37
+ def extract_documents(self) -> List[Dict[str, Any]]:
38
+ """Extract and chunk all documentation files"""
39
+ documents = []
40
+
41
+ if not self.doc_path.exists():
42
+ logger.warning(f"Documentation path {self.doc_path} does not exist")
43
+ return documents
44
+
45
+ for file_path in self._find_doc_files():
46
+ try:
47
+ chunks = self._process_file(file_path)
48
+ documents.extend(chunks)
49
+ except Exception as e:
50
+ logger.error(f"Error processing {file_path}: {e}")
51
+
52
+ logger.info(f"Extracted {len(documents)} document chunks")
53
+ return documents
54
+
55
+ def _find_doc_files(self) -> List[Path]:
56
+ """Find all documentation files"""
57
+ doc_files = []
58
+ for ext in self.SUPPORTED_EXTENSIONS:
59
+ doc_files.extend(self.doc_path.rglob(f"*{ext}"))
60
+ return doc_files
61
+
62
+ def _process_file(self, file_path: Path) -> List[Dict[str, Any]]:
63
+ """Process a single file into chunks"""
64
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
65
+ content = f.read()
66
+
67
+ # Calculate relative path for metadata
68
+ rel_path = file_path.relative_to(self.repo_path)
69
+
70
+ # Create chunks with overlap
71
+ chunks = []
72
+ for i in range(0, len(content), self.CHUNK_SIZE - self.CHUNK_OVERLAP):
73
+ chunk_text = content[i:i + self.CHUNK_SIZE]
74
+
75
+ # Skip empty or very small chunks
76
+ if len(chunk_text.strip()) < 50:
77
+ continue
78
+
79
+ # Generate unique ID for chunk
80
+ chunk_id = hashlib.md5(f"{rel_path}_{i}_{chunk_text[:50]}".encode()).hexdigest()
81
+
82
+ chunks.append({
83
+ "id": chunk_id,
84
+ "text": chunk_text,
85
+ "metadata": {
86
+ "source": str(rel_path),
87
+ "file_type": file_path.suffix,
88
+ "chunk_index": len(chunks),
89
+ "char_start": i,
90
+ "char_end": min(i + self.CHUNK_SIZE, len(content))
91
+ }
92
+ })
93
+
94
+ return chunks
95
+
96
+
97
+ # Removed custom embedding model - using ChromaDB's default
98
+
99
+
100
class ChromaRAGDatabase:
    """ChromaDB-based RAG database.

    Wraps a persistent ChromaDB client and one versioned collection,
    letting ChromaDB generate embeddings with its default model.
    """

    def __init__(self, db_path: Path):
        """Open (or create) a persistent ChromaDB store at ``db_path``."""
        self.db_path = db_path

        # Initialize ChromaDB with persistent storage
        self.client = chromadb.PersistentClient(path=str(db_path))

        # Collection name with version for easy updates
        self.collection_name = "gprmax_docs_v1"

    def create_collection(self, recreate: bool = False):
        """Create or reuse the document collection.

        Args:
            recreate: When True, drop any existing collection first so
                the database is rebuilt from scratch.
        """
        if recreate:
            try:
                self.client.delete_collection(self.collection_name)
                logger.info(f"Deleted existing collection: {self.collection_name}")
            except Exception:
                # The collection may simply not exist yet; nothing to delete.
                # (Narrowed from a bare `except:` so Ctrl-C / SystemExit
                # are not swallowed here.)
                pass

        # get_or_create avoids crashing when the collection already exists
        # and --recreate was not requested. ChromaDB supplies its default
        # embedding function.
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"created_at": datetime.now().isoformat()}
        )
        logger.info(f"Created collection: {self.collection_name}")

    def add_documents(self, documents: List[Dict[str, Any]]):
        """Add document chunks to the collection in batches.

        Args:
            documents: Chunk dicts with ``id``, ``text`` and ``metadata``
                keys (as produced by the document processor).
        """
        if not documents:
            logger.warning("No documents to add")
            return

        # Prepare data for ChromaDB
        ids = [doc["id"] for doc in documents]
        texts = [doc["text"] for doc in documents]
        metadatas = [doc["metadata"] for doc in documents]

        # Add to collection in batches (ChromaDB will generate embeddings automatically)
        batch_size = 100
        logger.info(f"Adding {len(documents)} documents to database...")
        for i in tqdm(range(0, len(ids), batch_size), desc="Adding to database"):
            end_idx = min(i + batch_size, len(ids))
            self.collection.add(
                ids=ids[i:end_idx],
                documents=texts[i:end_idx],
                metadatas=metadatas[i:end_idx]
                # No embeddings parameter - ChromaDB will generate them
            )

        logger.info(f"Added {len(documents)} documents to database")

        # Verify documents were added
        actual_count = self.collection.count()
        logger.info(f"Verified collection now contains {actual_count} documents")

    def save_metadata(self):
        """Write a metadata.json describing this build next to the DB."""
        # Get fresh count
        doc_count = self.collection.count()

        metadata = {
            "created_at": datetime.now().isoformat(),
            "embedding_model": "ChromaDB Default (all-MiniLM-L6-v2)",
            "collection_name": self.collection_name,
            "chunk_size": GprMaxDocumentProcessor.CHUNK_SIZE,
            "chunk_overlap": GprMaxDocumentProcessor.CHUNK_OVERLAP,
            "total_documents": doc_count
        }

        metadata_path = self.db_path / "metadata.json"
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)

        logger.info(f"Saved metadata to {metadata_path}")
176
+
177
+
178
def clone_gprmax_repo(target_dir: Path) -> Path:
    """Clone or update gprMax repository.

    Args:
        target_dir: Directory under which the ``gprMax`` checkout lives.

    Returns:
        Path to the local repository.
    """
    repo_path = target_dir / "gprMax"

    if not repo_path.exists():
        logger.info(f"Cloning gprMax repository to {repo_path}")
        git.Repo.clone_from(
            "https://github.com/gprMax/gprMax.git",
            repo_path,
            depth=1  # Shallow clone for faster download
        )
        return repo_path

    logger.info(f"Updating existing repository at {repo_path}")
    git.Repo(repo_path).remotes.origin.pull()
    return repo_path
195
+
196
+
197
def main():
    """Build the gprMax RAG database end to end.

    Pipeline: clone/update the gprMax repo, chunk its documentation,
    store the chunks in a persistent ChromaDB collection, write a
    metadata.json, then clean up the temporary checkout.

    Returns:
        Process exit code: 0 on success, 1 on failure.
    """
    parser = argparse.ArgumentParser(description="Generate RAG database from gprMax documentation")
    parser.add_argument(
        "--db-path",
        type=Path,
        default=Path(__file__).parent / "chroma_db",
        help="Path to store the ChromaDB database"
    )
    parser.add_argument(
        "--temp-dir",
        type=Path,
        default=Path(__file__).parent / "temp",
        help="Temporary directory for cloning repository"
    )
    parser.add_argument(
        "--recreate",
        action="store_true",
        help="Recreate database from scratch (delete existing)"
    )

    args = parser.parse_args()

    try:
        # Step 1: Clone/update gprMax repository
        logger.info("Step 1: Fetching gprMax repository...")
        repo_path = clone_gprmax_repo(args.temp_dir)

        # Step 2: Process documentation
        logger.info("Step 2: Processing documentation files...")
        processor = GprMaxDocumentProcessor(repo_path)
        documents = processor.extract_documents()

        if not documents:
            logger.error("No documents found to process")
            return 1

        # Step 3: Create database
        logger.info("Step 3: Creating vector database...")
        db = ChromaRAGDatabase(args.db_path)
        db.create_collection(recreate=args.recreate)

        # Step 4: Add documents
        logger.info("Step 4: Adding documents to database...")
        db.add_documents(documents)

        # Step 5: Save metadata
        db.save_metadata()

        logger.info(f"✅ Database successfully created at {args.db_path}")
        logger.info(f"Total documents: {len(documents)}")

        # Cleanup temp files if needed; guard keeps us from deleting the
        # database's own parent directory.
        if args.temp_dir.exists() and args.temp_dir != args.db_path.parent:
            logger.info("Cleaning up temporary files...")
            shutil.rmtree(args.temp_dir, ignore_errors=True)

        return 0

    except Exception as e:
        # logger.exception records the full traceback (logger.error did
        # not), so build failures are actually diagnosable from the log.
        logger.exception(f"Failed to generate database: {e}")
        return 1


if __name__ == "__main__":
    sys.exit(main())
rag-db/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # RAG Database Requirements
2
+ chromadb>=0.4.22
3
+ GitPython>=3.1.40
4
+ tqdm>=4.66.1
5
+ torch>=2.0.0
6
+ transformers>=4.44.0
7
+ sentencepiece
rag-db/retriever.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAG Retrieval Utilities for gprMax Documentation
3
+ Provides search and retrieval functions for the vector database
4
+ """
5
+
6
+ import logging
7
+ from pathlib import Path
8
+ from typing import List, Dict, Any, Optional, Tuple
9
+ import json
10
+
11
+ import chromadb
12
+ from dataclasses import dataclass
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
@dataclass
class SearchResult:
    """One retrieved chunk: its text, similarity score, and metadata."""
    text: str
    score: float
    metadata: Dict[str, Any]

    def __str__(self) -> str:
        # Compact one-line summary: score, source file, 100-char preview.
        source = self.metadata.get('source', 'Unknown')
        preview = self.text[:100]
        return f"[Score: {self.score:.3f}] {source}: {preview}..."
26
+
27
+
28
+ # Removed QwenEmbeddingModel class - using ChromaDB's default embedding
29
+
30
+
31
class GprMaxRAGRetriever:
    """Retriever for gprMax documentation RAG database.

    Opens an existing persistent ChromaDB store (built by generate_db.py)
    and exposes search/context helpers over its single collection.
    """

    def __init__(self, db_path: Optional[Path] = None):
        """Open the database at ``db_path`` (default: ./chroma_db).

        Raises:
            ValueError: If the path does not exist or the collection
                cannot be loaded.
        """
        if db_path is None:
            db_path = Path(__file__).parent / "chroma_db"

        if not db_path.exists():
            raise ValueError(f"Database path {db_path} does not exist. Run generate_db.py first.")

        self.db_path = db_path

        # Load metadata written by generate_db.py; tolerate its absence.
        metadata_path = db_path / "metadata.json"
        if metadata_path.exists():
            with open(metadata_path, 'r') as f:
                self.metadata = json.load(f)
        else:
            self.metadata = {}

        # Initialize ChromaDB client
        self.client = chromadb.PersistentClient(path=str(db_path))

        # Get collection; name comes from metadata, with a fallback that
        # matches the generator's hard-coded default.
        self.collection_name = self.metadata.get("collection_name", "gprmax_docs_v1")
        try:
            print(f"[RAG] Loading collection: {self.collection_name}")
            self.collection = self.client.get_collection(self.collection_name)
            doc_count = self.collection.count()
            print(f"[RAG] Loaded collection: {self.collection_name} with {doc_count} documents")
            logger.info(f"Loaded collection: {self.collection_name} with {doc_count} documents")
        except Exception as e:
            print(f"[RAG] ERROR loading collection: {e}")
            raise ValueError(f"Failed to load collection {self.collection_name}: {e}")

    def search(
        self,
        query: str,
        k: int = 10,
        threshold: float = 0.0,
        filter_metadata: Optional[Dict[str, Any]] = None
    ) -> List[SearchResult]:
        """
        Search for relevant documents

        Args:
            query: Search query text
            k: Number of results to return
            threshold: Minimum similarity score threshold
            filter_metadata: Optional metadata filters

        Returns:
            List of SearchResult objects
        """
        # Search in ChromaDB (it will generate embeddings automatically)
        try:
            results = self.collection.query(
                query_texts=[query],  # Use query_texts instead of query_embeddings
                n_results=k,
                where=filter_metadata if filter_metadata else None,
                include=["documents", "metadatas", "distances"]
            )
            logger.info(f"ChromaDB query returned: {len(results.get('documents', [[]])[0]) if results.get('documents') else 0} results")
        except Exception as e:
            logger.error(f"ChromaDB query failed: {e}")
            raise

        # Convert to SearchResult objects
        search_results = []
        if results["documents"] and results["documents"][0]:
            for doc, meta, dist in zip(
                results["documents"][0],
                results["metadatas"][0],
                results["distances"][0]
            ):
                # Convert distance to similarity score (1 - normalized_distance)
                # NOTE(review): this normalization assumes cosine distance in
                # [0, 2]; ChromaDB's default space is l2, where distances are
                # unbounded and scores can go negative — TODO confirm the
                # collection's distance metric.
                score = 1.0 - (dist / 2.0)  # Assuming cosine distance in [-1, 1]

                if score >= threshold:
                    search_results.append(SearchResult(
                        text=doc,
                        score=score,
                        metadata=meta
                    ))

        return search_results

    def get_context(
        self,
        query: str,
        k: int = 3,
        max_context_length: int = 2000,
        format_as_markdown: bool = True
    ) -> str:
        """
        Get formatted context for a query

        Args:
            query: Search query
            k: Number of documents to retrieve
            max_context_length: Maximum total context length
            format_as_markdown: Format output as markdown

        Returns:
            Formatted context string
        """
        results = self.search(query, k=k)

        if not results:
            return "No relevant documentation found."

        context_parts = []
        total_length = 0

        for i, result in enumerate(results, 1):
            if total_length >= max_context_length:
                break

            # Truncate so accumulated document text stays within the budget.
            # (Markdown headers/fences are not counted toward the limit.)
            text = result.text
            if total_length + len(text) > max_context_length:
                text = text[:max_context_length - total_length]

            if format_as_markdown:
                source = result.metadata.get("source", "Unknown")
                context_parts.append(
                    f"### Document {i} (Source: {source}, Score: {result.score:.3f})\n"
                    f"```\n{text}\n```\n"
                )
            else:
                context_parts.append(text)

            total_length += len(text)

        return "\n".join(context_parts)

    def get_relevant_files(self, query: str, k: int = 5) -> List[str]:
        """Get list of relevant source files for a query.

        Returns:
            Sorted, de-duplicated source paths from the top-k results.
        """
        results = self.search(query, k=k)

        # Extract unique source files
        sources = set()
        for result in results:
            source = result.metadata.get("source")
            if source:
                sources.add(source)

        return sorted(list(sources))

    def search_by_file(self, file_pattern: str, k: int = 10) -> List[SearchResult]:
        """Return up to ``k`` chunks whose source path contains the pattern
        (case-insensitive substring match).
        """
        # This would need ChromaDB's where clause with pattern matching
        # For now, we do a broad search and filter.
        # NOTE(review): only the first 1000 stored chunks are scanned, so
        # matches beyond that limit are silently missed.
        results = self.collection.get(
            limit=1000,  # Get many results
            include=["documents", "metadatas"]
        )

        filtered_results = []
        if results["documents"]:
            for doc, meta in zip(results["documents"], results["metadatas"]):
                source = meta.get("source", "")
                if file_pattern.lower() in source.lower():
                    filtered_results.append(SearchResult(
                        text=doc,
                        score=1.0,  # No score for direct retrieval
                        metadata=meta
                    ))

                if len(filtered_results) >= k:
                    break

        return filtered_results

    def get_stats(self) -> Dict[str, Any]:
        """Get database statistics (live document count plus build metadata)."""
        stats = {
            "total_documents": self.collection.count(),
            "database_path": str(self.db_path),
            "collection_name": self.collection_name,
            "embedding_model": self.metadata.get("embedding_model", "Unknown"),
            "created_at": self.metadata.get("created_at", "Unknown"),
            "chunk_size": self.metadata.get("chunk_size", "Unknown"),
            "chunk_overlap": self.metadata.get("chunk_overlap", "Unknown")
        }
        return stats
217
+
218
+
219
def create_retriever(db_path: Optional[Path] = None) -> GprMaxRAGRetriever:
    """Factory function to create a retriever instance.

    Args:
        db_path: Optional database location; defaults to the package's
            ``chroma_db`` directory when omitted.
    """
    retriever = GprMaxRAGRetriever(db_path=db_path)
    return retriever
222
+
223
+
224
if __name__ == "__main__":
    # Smoke-test entry point: run a query against the local database and
    # print stats, raw results, and the formatted context.
    # Example usage
    import sys

    # Query comes from the command line, or falls back to a demo question.
    if len(sys.argv) > 1:
        query = " ".join(sys.argv[1:])
    else:
        query = "How to create a source in gprMax?"

    print(f"Testing retriever with query: '{query}'")
    print("-" * 80)

    try:
        retriever = create_retriever()

        # Get stats
        stats = retriever.get_stats()
        print(f"Database stats: {stats}")
        print("-" * 80)

        # Search
        results = retriever.search(query, k=3)
        print(f"Found {len(results)} results:")
        for i, result in enumerate(results, 1):
            print(f"\n{i}. {result}")

        # Get formatted context
        print("\n" + "=" * 80)
        print("Formatted context:")
        print(retriever.get_context(query, k=3))

    except Exception as e:
        # Any failure (missing DB, broken collection) exits non-zero.
        print(f"Error: {e}")
        sys.exit(1)
requirements.txt CHANGED
@@ -4,4 +4,7 @@ spaces
4
  accelerate
5
  sentencepiece
6
  einops
7
- numpy < 2.0.0
 
 
 
 
4
  accelerate
5
  sentencepiece
6
  einops
7
+ numpy < 2.0.0
8
+ chromadb
9
+ GitPython
10
+ tqdm