souvik16011991roy committed on
Commit
972aab5
·
verified ·
1 Parent(s): fbf23ae

Upload 4 files

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. app.py +104 -0
  3. data.db +3 -0
  4. database.py +88 -0
  5. requirements.txt +7 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data.db filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sqlite3
3
+ import pandas as pd
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
+ import database
6
+ import json
7
+
8
+ # Initialize database
9
+ database.init_database()
10
+
11
+ # Get schema information
12
+ schema_info = database.get_schema_info()
13
+
14
+ # Initialize the model and tokenizer
15
+ @st.cache_resource
16
+ def load_model():
17
+ tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
18
+ model = AutoModelForCausalLM.from_pretrained("codellama/CodeLlama-7b-hf")
19
+ return model, tokenizer
20
+
21
+ def create_schema_prompt():
22
+ prompt = "Database Schema:\n"
23
+ for table, info in schema_info.items():
24
+ prompt += f"\nTable: {table}\n"
25
+ prompt += "Columns:\n"
26
+ for col, type_ in zip(info['columns'], info['types']):
27
+ sample_values = info['sample_values'][col][:3] # Take first 3 sample values
28
+ prompt += f"- {col} ({type_}), Example values: {', '.join(map(str, sample_values))}\n"
29
+ return prompt
30
+
31
+ def generate_sql_query(question):
32
+ model, tokenizer = load_model()
33
+
34
+ # Create detailed prompt with schema information
35
+ schema_prompt = create_schema_prompt()
36
+ prompt = f"""Given the following database schema and question, generate a SQL query that answers the question.
37
+
38
+ {schema_prompt}
39
+
40
+ Question: {question}
41
+
42
+ Write only the SQL query without any additional text or explanation. Make sure to:
43
+ 1. Use the correct table and column names as shown in the schema
44
+ 2. Handle joins appropriately if multiple tables are needed
45
+ 3. Use appropriate SQL functions based on the question context
46
+
47
+ SQL Query:"""
48
+
49
+ # Generate SQL query
50
+ inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
51
+ outputs = model.generate(
52
+ **inputs,
53
+ max_length=500,
54
+ num_return_sequences=1,
55
+ temperature=0.7,
56
+ top_p=0.95,
57
+ do_sample=True
58
+ )
59
+ sql_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
60
+
61
+ # Extract only the SQL part
62
+ sql_query = sql_query.split("SQL Query:")[-1].strip()
63
+ return sql_query
64
+
65
+ def execute_query(query):
66
+ conn = sqlite3.connect('data.db')
67
+ try:
68
+ result = pd.read_sql_query(query, conn)
69
+ return result, None
70
+ except Exception as e:
71
+ return None, str(e)
72
+ finally:
73
+ conn.close()
74
+
75
+ # Streamlit UI
76
+ st.title("Intelligent Text to SQL Query Assistant")
77
+ st.write("Ask questions about your data in natural language!")
78
+
79
+ # Display schema information in expandable section
80
+ with st.expander("View Database Schema"):
81
+ st.code(create_schema_prompt(), language="text")
82
+
83
+ # User input
84
+ user_question = st.text_area("Enter your question:", height=100)
85
+
86
+ if st.button("Generate and Execute Query"):
87
+ if user_question:
88
+ with st.spinner("Generating SQL query..."):
89
+ # Generate SQL query
90
+ sql_query = generate_sql_query(user_question)
91
+
92
+ # Display the generated query
93
+ st.subheader("Generated SQL Query:")
94
+ st.code(sql_query, language="sql")
95
+
96
+ # Execute the query
97
+ with st.spinner("Executing query..."):
98
+ results, error = execute_query(sql_query)
99
+
100
+ if error:
101
+ st.error(f"Error executing query: {error}")
102
+ else:
103
+ st.subheader("Query Results:")
104
+ st.dataframe(results)
data.db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:426aac0b39b5ca888406c7471807463f82bc25b68e8568a6af16456cae01abf0
3
+ size 438272
database.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import pandas as pd
3
+ import os
4
+ from huggingface_hub import hf_hub_download
5
+ import io
6
+ import requests
7
+ from io import StringIO
8
+
9
+ def download_dataset(url):
10
+ # Convert URL to raw content URL
11
+ raw_url = url.replace('blob/', '')
12
+ raw_url = raw_url.replace('https://huggingface.co/', 'https://huggingface.co/')
13
+ raw_url = raw_url.replace('/tree/main', '/resolve/main')
14
+
15
+ headers = {
16
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
17
+ }
18
+
19
+ # Download the data
20
+ response = requests.get(raw_url, headers=headers)
21
+ response.raise_for_status() # Raise an exception for bad status codes
22
+
23
+ # Read CSV data
24
+ return pd.read_csv(StringIO(response.text))
25
+
26
+ def init_database():
27
+ # Create database connection
28
+ conn = sqlite3.connect('data.db')
29
+
30
+ try:
31
+ # Download files from Hugging Face
32
+ bonus_data_path = hf_hub_download(
33
+ repo_id="AIforAll16011991/bonus_data",
34
+ filename="Bonus_Data.csv",
35
+ repo_type="dataset"
36
+ )
37
+
38
+ player_kpi_path = hf_hub_download(
39
+ repo_id="AIforAll16011991/bonus_data",
40
+ filename="Player_KPIs.csv",
41
+ repo_type="dataset"
42
+ )
43
+
44
+ # Read CSV files
45
+ bonus_data = pd.read_csv(bonus_data_path)
46
+ player_kpi = pd.read_csv(player_kpi_path)
47
+
48
+ # Write to SQLite database
49
+ bonus_data.to_sql('bonus_data', conn, if_exists='replace', index=False)
50
+ player_kpi.to_sql('player_kpi', conn, if_exists='replace', index=False)
51
+
52
+ print("Database initialized successfully with data from Hugging Face!")
53
+
54
+ except Exception as e:
55
+ print(f"Error initializing database: {str(e)}")
56
+ raise
57
+ finally:
58
+ conn.close()
59
+
60
+ def get_schema_info():
61
+ conn = sqlite3.connect('data.db')
62
+ cursor = conn.cursor()
63
+
64
+ schema_info = {}
65
+
66
+ # Get table information
67
+ tables = cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
68
+
69
+ for table in tables:
70
+ table_name = table[0]
71
+ # Get column information
72
+ columns = cursor.execute(f"PRAGMA table_info({table_name});").fetchall()
73
+
74
+ # Get sample data for each column
75
+ sample_data = pd.read_sql_query(f"SELECT * FROM {table_name} LIMIT 5", conn)
76
+
77
+ # Store column information and data types
78
+ schema_info[table_name] = {
79
+ 'columns': [col[1] for col in columns],
80
+ 'types': [col[2] for col in columns],
81
+ 'sample_values': {col: sample_data[col].tolist() for col in sample_data.columns}
82
+ }
83
+
84
+ conn.close()
85
+ return schema_info
86
+
87
+ if __name__ == "__main__":
88
+ init_database()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit==1.31.1
2
+ transformers==4.37.2
3
+ torch==2.2.0
4
+ pandas==2.2.0
5
+ sqlite-utils==3.35.2
6
+ requests==2.31.0
7
+ huggingface_hub==0.21.4