Spaces:

mhdakmal80
/

olist-text2sql

Runtime error

App Files Files Community

mhdakmal80 commited on Nov 25, 2025

Commit

d60cb1f

verified ·

1 Parent(s): 64a5793

Upload 6 files

Browse files

Files changed (7) hide show

.gitattributes +1 -0
README.md +61 -13
app_gradio.py +178 -0
database.py +205 -0
model_loader.py +199 -0
olist.sqlite +3 -0
requirements.txt +12 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+olist.sqlite filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,13 +1,61 @@
----
-title: Olist Text2sql
-emoji: 🐢
-colorFrom: green
-colorTo: purple
-sdk: gradio
-sdk_version: 6.0.0
-app_file: app.py
-pinned: false
-license: mit
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: Olist Text-to-SQL Agent
+emoji: 🤖
+colorFrom: blue
+colorTo: purple
+sdk: gradio
+sdk_version: 4.44.0
+app_file: app_gradio.py
+pinned: false
+license: mit
+---
+# 🤖 Olist Text-to-SQL Agent
+Convert natural language questions into SQL queries using a **fine-tuned Mistral-7B model**.
+## 🎯 Features
+- **Fine-Tuned Model**: Mistral-7B-Instruct-v0.2 fine-tuned with QLoRA on Olist e-commerce dataset
+- **Natural Language to SQL**: Ask questions in plain English, get executable SQL queries
+- **Real Database**: Query against actual Olist e-commerce data (100K+ orders)
+- **Interactive UI**: Built with Gradio for easy interaction
+## 🚀 How to Use
+1. Type your question in natural language
+2. Click "Generate SQL & Execute"
+3. View the generated SQL query and results
+## 💡 Example Questions
+- "How many orders are there?"
+- "What are the top 5 best-selling products?"
+- "Show total revenue by customer state"
+- "Which sellers have the highest ratings?"
+- "List all orders from São Paulo"
+## 🛠️ Tech Stack
+- **Model**: Mistral-7B-Instruct-v0.2 (fine-tuned with QLoRA)
+- **Frontend**: Gradio
+- **Database**: SQLite (Olist e-commerce dataset)
+- **ML Libraries**: PyTorch, Transformers, PEFT, BitsAndBytes
+## 📊 Model Details
+- **Base Model**: mistralai/Mistral-7B-Instruct-v0.2
+- **Fine-Tuned Model**: [mhdakmal80/Olist-SQL-Agent-Final](https://huggingface.co/mhdakmal80/Olist-SQL-Agent-Final)
+- **Training Method**: QLoRA (4-bit quantization)
+- **Training Data**: 1000+ synthetic question-SQL pairs
+- **Accuracy**: 90% on test set
+## 🎓 About
+This project demonstrates:
+- Fine-tuning large language models (7B parameters)
+- Parameter-efficient fine-tuning with QLoRA
+- Production deployment of ML models
+- Full-stack application development
+Built by [mhdakmal80](https://huggingface.co/mhdakmal80)

app_gradio.py ADDED Viewed

	@@ -0,0 +1,178 @@

+"""
+Olist Text-to-SQL Gradio Application
+Gradio interface for the fine-tuned Mistral-7B model.
+"""
+import gradio as gr
+import pandas as pd
+from model_loader import FineTunedModelLoader
+from database import DatabaseHandler
+import os
+from dotenv import load_dotenv
+# Load environment variables
+# (presumably from a local .env file when one exists, so the os.getenv()
+# lookups below can be overridden without touching the code)
+load_dotenv()
+# Initialize components
+# NOTE: everything below runs at import time; the database handler, the model
+# and the schema text are created once and shared by all UI callbacks.
+print("🔄 Initializing model and database...")
+db_path = os.getenv("DATABASE_PATH", "olist.sqlite")
+adapter_path = os.getenv("ADAPTER_PATH", "mhdakmal80/Olist-SQL-Agent-Final")
+db_handler = DatabaseHandler(db_path)
+model_loader = FineTunedModelLoader(adapter_path=adapter_path)
+# The schema string is extracted once; it is fed into every prompt and also
+# displayed in the "Database Schema" accordion of the UI.
+db_schema = db_handler.get_schema()
+print("✅ Model and database loaded!")
+# Example questions
+# Each entry is a one-element list: one value for the single input component
+# (the question textbox) that gr.Examples is wired to.
+EXAMPLES = [
+    ["How many orders are there?"],
+    ["What are the top 5 best-selling products?"],
+    ["Show total revenue by customer state"],
+    ["Which sellers have the highest ratings?"],
+    ["List all orders from São Paulo"],
+    ["What is the average delivery time?"],
+    ["Count customers by state"],
+    ["Show payment types and their usage"],
+]
+def generate_and_execute(question):
+    """
+    Generate SQL from question and execute it.
+    Args:
+        question: Natural language question
+    Returns:
+        Tuple of (sql_query, results_df, status_message)
+    """
+    if not question or not question.strip():
+        return "", None, "⚠️ Please enter a question"
+    # Generate SQL
+    result = model_loader.generate_sql(question, db_schema)
+    if not result['success']:
+        return "", None, f"❌ SQL Generation Failed: {result['error']}"
+    sql_query = result['sql']
+    # Execute query
+    exec_result = db_handler.execute_query(sql_query)
+    if not exec_result['success']:
+        return sql_query, None, f"❌ Query Execution Failed: {exec_result['error']}"
+    # Format results
+    df = exec_result['data']
+    row_count = exec_result['row_count']
+    status = f"✅ Success! Retrieved {row_count} rows"
+    if exec_result.get('warning'):
+        status += f"\n⚠️ {exec_result['warning']}"
+    return sql_query, df, status
+# Create Gradio interface
+with gr.Blocks(title="Olist Text-to-SQL Agent", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🤖 Olist Text-to-SQL Agent
+    Convert natural language questions into SQL queries using a **fine-tuned Mistral-7B model**.
+    **Model**: Mistral-7B-Instruct-v0.2 fine-tuned with QLoRA on Olist e-commerce dataset
+    ⚠️ **Note**: Running on CPU - queries may take 30-60 seconds. For faster performance, the model supports GPU deployment.
+    """)
+    with gr.Row():
+        with gr.Column(scale=2):
+            question_input = gr.Textbox(
+                label="Ask your question",
+                placeholder="e.g., What are the top 10 customers by total spending?",
+                lines=3
+            )
+            with gr.Row():
+                submit_btn = gr.Button("🚀 Generate SQL & Execute", variant="primary")
+                clear_btn = gr.ClearButton([question_input])
+        with gr.Column(scale=1):
+            gr.Markdown("""
+            ### 💡 Example Questions
+            Click any example to try it!
+            """)
+    with gr.Row():
+        sql_output = gr.Code(
+            label="Generated SQL Query",
+            language="sql",
+            lines=5
+        )
+    with gr.Row():
+        status_output = gr.Textbox(
+            label="Status",
+            lines=2
+        )
+    with gr.Row():
+        results_output = gr.Dataframe(
+            label="Query Results",
+            wrap=True,
+            max_height=400
+        )
+    # Examples section
+    gr.Examples(
+        examples=EXAMPLES,
+        inputs=question_input,
+        label="Try these examples:"
+    )
+    # Info section
+    with gr.Accordion("ℹ️ About this app", open=False):
+        gr.Markdown("""
+        ### Model Details
+        - **Base Model**: mistralai/Mistral-7B-Instruct-v0.2
+        - **Fine-Tuned Model**: [mhdakmal80/Olist-SQL-Agent-Final](https://huggingface.co/mhdakmal80/Olist-SQL-Agent-Final)
+        - **Training Method**: QLoRA (4-bit quantization)
+        - **Training Data**: 1000+ synthetic question-SQL pairs
+        - **Accuracy**: 90% on test set
+        ### Database
+        - **Dataset**: Olist E-commerce (Brazilian marketplace)
+        - **Tables**: 9 tables with 100K+ orders
+        - **Columns**: Customer info, orders, products, payments, reviews, sellers
+        ### Tech Stack
+        - PyTorch, Transformers, PEFT, BitsAndBytes
+        - Gradio for UI
+        - SQLite for database
+        """)
+    with gr.Accordion("🗄️ Database Schema", open=False):
+        gr.Code(
+            value=db_schema,
+            language="sql",
+            label="Database Schema",
+            lines=20
+        )
+    # Event handlers
+    submit_btn.click(
+        fn=generate_and_execute,
+        inputs=question_input,
+        outputs=[sql_output, results_output, status_output]
+    )
+    question_input.submit(
+        fn=generate_and_execute,
+        inputs=question_input,
+        outputs=[sql_output, results_output, status_output]
+    )
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()

database.py ADDED Viewed

	@@ -0,0 +1,205 @@

+import sqlite3
+import pandas as pd
+from typing import Dict, Any, Optional, List
+class DatabaseHandler:
+    """Handles all database operations for the Olist database."""
+    def __init__(self, db_path: str = "olist.sqlite"):
+        """
+        Initialize database handler.
+        Args:
+            db_path: Path to SQLite database file
+        """
+        self.db_path = db_path
+        self._verify_database()
+    def _verify_database(self):
+        """Verify database exists and is accessible."""
+        try:
+            conn = sqlite3.connect(self.db_path)
+            conn.close()
+        except Exception as e:
+            raise FileNotFoundError(f"Database not found at {self.db_path}: {str(e)}")
+    def get_schema(self) -> str:
+        """
+        Extract and format database schema.
+        Returns:
+            Formatted schema string with all tables and columns
+        """
+        try:
+            conn = sqlite3.connect(self.db_path)
+            cursor = conn.cursor()
+            # Get all table names
+            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+            tables = cursor.fetchall()
+            schema_parts = []
+            for table in tables:
+                table_name = table[0]
+                # Get column information
+                cursor.execute(f"PRAGMA table_info({table_name});")
+                columns = cursor.fetchall()
+                # Format table schema
+                schema_parts.append(f"\nTable: {table_name}")
+                schema_parts.append("Columns:")
+                for col in columns:
+                    col_name = col[1]
+                    col_type = col[2]
+                    is_pk = " (PRIMARY KEY)" if col[5] else ""
+                    schema_parts.append(f"  - {col_name} ({col_type}){is_pk}")
+            conn.close()
+            return "\n".join(schema_parts)
+        except Exception as e:
+            return f"Error extracting schema: {str(e)}"
+    def execute_query(self, sql: str, max_rows: int = 1000) -> Dict[str, Any]:
+        """
+        Execute SQL query and return results.
+        Args:
+            sql: SQL query to execute
+            max_rows: Maximum number of rows to return
+        Returns:
+            Dictionary with:
+                - success: Boolean indicating success
+                - data: Pandas DataFrame with results
+                - row_count: Number of rows returned
+                - error: Error message if failed
+        """
+        # Validate query first
+        if not self._validate_query(sql):
+            return {
+                "success": False,
+                "data": None,
+                "row_count": 0,
+                "error": "Query validation failed: Only SELECT queries are allowed"
+            }
+        try:
+            conn = sqlite3.connect(self.db_path)
+            # Execute query and fetch results
+            df = pd.read_sql_query(sql, conn)
+            # Limit rows if needed
+            if len(df) > max_rows:
+                df = df.head(max_rows)
+                warning = f"Results limited to {max_rows} rows"
+            else:
+                warning = None
+            conn.close()
+            return {
+                "success": True,
+                "data": df,
+                "row_count": len(df),
+                "error": None,
+                "warning": warning
+            }
+        except Exception as e:
+            return {
+                "success": False,
+                "data": None,
+                "row_count": 0,
+                "error": f"Query execution error: {str(e)}"
+            }
+    def _validate_query(self, sql: str) -> bool:
+        """
+        Validate SQL query for safety.
+        Args:
+            sql: SQL query to validate
+        Returns:
+            True if query is safe, False otherwise
+        """
+        sql_upper = sql.upper().strip()
+        # Only allow SELECT queries
+        if not sql_upper.startswith("SELECT"):
+            return False
+        # Block dangerous keywords
+        dangerous_keywords = [
+            "DROP", "DELETE", "INSERT", "UPDATE",
+            "ALTER", "CREATE", "TRUNCATE", "REPLACE"
+        ]
+        for keyword in dangerous_keywords:
+            if keyword in sql_upper:
+                return False
+        return True
+    def get_table_names(self) -> List[str]:
+        """
+        Get list of all table names in database.
+        Returns:
+            List of table names
+        """
+        try:
+            conn = sqlite3.connect(self.db_path)
+            cursor = conn.cursor()
+            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+            tables = [row[0] for row in cursor.fetchall()]
+            conn.close()
+            return tables
+        except Exception as e:
+            print(f"Error getting table names: {e}")
+            return []
+    def get_table_preview(self, table_name: str, limit: int = 5) -> Optional[pd.DataFrame]:
+        """
+        Get preview of table data.
+        Args:
+            table_name: Name of table to preview
+            limit: Number of rows to return
+        Returns:
+            DataFrame with sample data or None if error
+        """
+        try:
+            conn = sqlite3.connect(self.db_path)
+            df = pd.read_sql_query(f"SELECT * FROM {table_name} LIMIT {limit};", conn)
+            conn.close()
+            return df
+        except Exception as e:
+            print(f"Error previewing table {table_name}: {e}")
+            return None
+# Test function
+if __name__ == "__main__":
+    # Quick test
+    db = DatabaseHandler("olist.sqlite")
+    print("=== Database Schema ===")
+    print(db.get_schema())
+    print("\n=== Table Names ===")
+    print(db.get_table_names())
+    print("\n=== Test Query ===")
+    result = db.execute_query("SELECT COUNT(*) as total_orders FROM orders;")
+    print(f"Success: {result['success']}")
+    if result['success']:
+        print(result['data'])

model_loader.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from peft import PeftModel
+from typing import Dict, Any, Optional
+import re
+class FineTunedModelLoader:
+    """Loads and manages the fine-tuned Mistral-7B model."""
+    def __init__(self,
+                 base_model_name: str = "mistralai/Mistral-7B-Instruct-v0.2",
+                 adapter_path: str = "mhdakmal80/Olist-SQL-Agent-Final",
+                 use_4bit: bool = True):
+        """
+        Initialize the fine-tuned model.
+        Args:
+            base_model_name: HuggingFace model name
+            adapter_path: Path to LoRA adapter weights
+            use_4bit: Whether to use 4-bit quantization
+        """
+        self.base_model_name = base_model_name
+        self.adapter_path = adapter_path
+        self.use_4bit = use_4bit
+        print(" Loading fine-tuned model...")
+        self.model, self.tokenizer = self._load_model()
+        print(" Model loaded successfully!")
+    def _load_model(self):
+        """Load the base model and LoRA adapters."""
+        # Configure 4-bit quantization if enabled
+        if self.use_4bit:
+            bnb_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.bfloat16,
+                bnb_4bit_use_double_quant=False,
+            )
+        else:
+            bnb_config = None
+        # Load base model
+        print(f"  Loading base model: {self.base_model_name}")
+        base_model = AutoModelForCausalLM.from_pretrained(
+            self.base_model_name,
+            quantization_config=bnb_config if self.use_4bit else None,
+            torch_dtype=torch.bfloat16 if not self.use_4bit else None,
+            device_map="auto",
+            trust_remote_code=True,
+        )
+        # Load tokenizer
+        print(f"  Loading tokenizer")
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.base_model_name,
+            trust_remote_code=True
+        )
+        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.padding_side = "right"
+        # Load LoRA adapter
+        print(f"  Loading LoRA adapter from: {self.adapter_path}")
+        model = PeftModel.from_pretrained(base_model, self.adapter_path)
+        return model, tokenizer
+    def generate_sql(self, question: str, schema: str) -> Dict[str, Any]:
+        """
+        Generate SQL query from natural language question.
+        Args:
+            question: User's natural language question
+            schema: Database schema as string
+        Returns:
+            Dictionary with 'sql', 'success', and 'error' keys
+        """
+        # Format prompt
+        prompt = f"""[INST]You are a SQL expert. Generate a valid SQLite query using ONLY the columns and tables listed below.
+Don't ever use columns that is not in the schema (this need to be followed strictly).Always try to come up the
+solution based on provided schema only.
+### Available Tables and Columns:
+{schema}
+### IMPORTANT:
+- Use ONLY the column names listed above
+- Do NOT invent column names
+- Do NOT use columns that don't exist
+### Question:
+{question}
+### Generate SQL using only the columns listed above:
+[/INST]```sql
+"""
+        try:
+            # Tokenize
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                truncation=True,
+                max_length=512
+            )
+            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+            # Generate
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=256,
+                    temperature=0.1,
+                    do_sample=False,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                )
+            # Decode
+            generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            # Extract SQL from response
+            sql_query = self._extract_sql(generated_text, prompt)
+            return {
+                "sql": sql_query,
+                "success": True,
+                "error": None
+            }
+        except Exception as e:
+            return {
+                "sql": "",
+                "success": False,
+                "error": f"Model Error: {str(e)}"
+            }
+    def _extract_sql(self, generated_text: str, prompt: str) -> str:
+        """
+        Extract SQL query from generated text.
+        Args:
+            generated_text: Full generated text from model
+            prompt: Original prompt (to remove from output)
+        Returns:
+            Cleaned SQL query
+        """
+        # Remove the prompt from the generated text
+        sql = generated_text.replace(prompt, "").strip()
+        # Try to extract SQL after "### SQL Query:" marker
+        patterns = [
+            r"### SQL Query:\s*(.+?)(?:###|$)",
+            r"```sql\s*(.+?)\s*```",
+            r"SELECT\s+.+",
+        ]
+        for pattern in patterns:
+            match = re.search(pattern, sql, re.IGNORECASE | re.DOTALL)
+            if match:
+                sql = match.group(1) if match.lastindex else match.group(0)
+                break
+        # Clean up
+        sql = sql.replace("```sql", "").replace("```", "")
+        sql = " ".join(sql.split())  # Remove extra whitespace
+        sql = sql.strip()
+        # Ensure it ends with semicolon
+        if not sql.endswith(";"):
+            sql += ";"
+        return sql
+# Test function
+if __name__ == "__main__":
+    # Quick test
+    model_loader = FineTunedModelLoader()
+    test_schema = """
+    Table: orders
+    Columns: order_id, customer_id, order_status, order_purchase_timestamp
+    """
+    result = model_loader.generate_sql(
+        "How many orders are there?",
+        test_schema
+    )
+    print(f"\nSuccess: {result['success']}")
+    print(f"SQL: {result['sql']}")
+    if result['error']:
+        print(f"Error: {result['error']}")

olist.sqlite ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:49446afd935721ee12fc95316fbee9666a3e1bd4872dfa194fe4625d6762a81a
+size 112701440

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+# Core dependencies
+gradio>=4.0.0
+python-dotenv==1.0.0
+pandas==2.1.4
+# ML/AI dependencies for fine-tuned model
+torch>=2.0.0
+transformers>=4.35.0
+accelerate>=0.24.0
+peft>=0.6.0
+bitsandbytes>=0.41.0
+sentencepiece>=0.1.99