Update app.py
Browse files
app.py
CHANGED
|
@@ -3,13 +3,22 @@ import os
|
|
| 3 |
from sentence_transformers import SentenceTransformer
|
| 4 |
import gradio as gr
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
-
|
|
|
|
|
|
|
| 7 |
import pandas as pd
|
| 8 |
load_dotenv()
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
|
|
|
|
| 12 |
|
|
|
|
| 13 |
if not os.path.exists(dataset_folder):
|
| 14 |
print(f"Warning: Dataset folder '{dataset_folder}' not found. Using current directory instead.")
|
| 15 |
dataset_folder = "." # Fallback: Look in the current directory
|
|
@@ -25,28 +34,26 @@ warnings.simplefilter("ignore", category=pd.errors.DtypeWarning)
|
|
| 25 |
# Load all CSV files in the dataset folder
|
| 26 |
dataframes = []
|
| 27 |
for file in os.listdir(dataset_folder):
|
| 28 |
-
if file.endswith(".csv"):
|
| 29 |
try:
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
encoding="utf-8"
|
| 35 |
-
|
| 36 |
-
|
| 37 |
|
| 38 |
column_types = {col: str for col in sample_df.columns} # Force all columns to string
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
dtype=column_types,
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
dataframes.append(df) # Append DataFrame to the list
|
| 50 |
except Exception as e:
|
| 51 |
print(f"Error reading {file}: {e}")
|
| 52 |
|
|
@@ -120,33 +127,32 @@ def create_prompt(user_query, table_metadata):
|
|
| 120 |
|
| 121 |
|
| 122 |
def generate_sql_query(system_prompt):
|
| 123 |
-
"""Uses
|
| 124 |
try:
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
messages=[{"role": "system", "content": system_prompt}],
|
| 128 |
-
model="llama3-70b-8192"
|
| 129 |
-
)
|
| 130 |
|
| 131 |
-
#
|
| 132 |
-
|
| 133 |
|
| 134 |
-
#
|
| 135 |
-
|
| 136 |
-
print(f"✅ AI Response: {result}") # Debugging
|
| 137 |
|
| 138 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
if result.lower().startswith("select"):
|
| 140 |
return result
|
| 141 |
else:
|
| 142 |
-
print("⚠️
|
| 143 |
-
return "⚠️
|
| 144 |
|
| 145 |
except Exception as e:
|
| 146 |
-
print(f"❌ API Error: {e}")
|
| 147 |
return "⚠️ API failed. Check logs."
|
| 148 |
|
| 149 |
-
|
| 150 |
def response(user_query, dataset_folder):
|
| 151 |
"""Processes the user query and returns an SQL query."""
|
| 152 |
dataframes, metadata_list = load_dataset_metadata(dataset_folder)
|
|
|
|
| 3 |
# --- Imports and environment setup ---
# Standard library
import os

# Third-party
import google.generativeai as genai
import gradio as gr
import pandas as pd
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load variables from a local .env file (no-op if the file is absent).
load_dotenv()

# Read the API key from env and warn if missing so a misconfigured
# deployment is obvious before the first API call fails.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    print("Warning: GEMINI_API_KEY not set in environment. Set it in your .env file or system env vars.")

# Configure the Gemini client. NOTE(review): with api_key=None the library
# falls back to GOOGLE_API_KEY / application-default credentials, so this is
# deliberately unconditional.
genai.configure(api_key=GEMINI_API_KEY)

# Use the current directory for Hugging Face Spaces.
dataset_folder = "./data"  # Assuming files are in a 'data/' folder

# Verify the folder exists; fall back to the working directory otherwise.
if not os.path.exists(dataset_folder):
    print(f"Warning: Dataset folder '{dataset_folder}' not found. Using current directory instead.")
    dataset_folder = "."  # Fallback: Look in the current directory
|
|
|
| 34 |
# Load all CSV files in the dataset folder.
dataframes = []


def _read_csv_resilient(csv_path, **read_kwargs):
    """Read a CSV with utf-8, falling back to latin1 on decode errors.

    The same two-encoding fallback was previously duplicated for the
    sample read and the full read; this helper keeps them in sync.
    """
    try:
        return pd.read_csv(csv_path, encoding="utf-8", **read_kwargs)
    except UnicodeDecodeError:
        # Some exports (e.g. Excel / legacy tools) are latin1-encoded.
        return pd.read_csv(csv_path, encoding="latin1", **read_kwargs)


for file in os.listdir(dataset_folder):
    if file.endswith(".csv"):  # Check if the file is a CSV
        try:
            path = os.path.join(dataset_folder, file)

            # Sample a few rows first so every column can be forced to str,
            # avoiding mixed-dtype surprises when reading the full file.
            sample_df = _read_csv_resilient(path, nrows=5)
            column_types = {col: str for col in sample_df.columns}  # Force all columns to string

            df = _read_csv_resilient(path, dtype=column_types, low_memory=False)

            df = df.fillna('')  # Fill NaN values with empty strings
            dataframes.append(df)
        except Exception as e:
            # Best-effort loading: report and skip unreadable files rather
            # than aborting the whole app at startup.
            print(f"Error reading {file}: {e}")
|
| 59 |
|
|
|
|
| 127 |
|
| 128 |
|
| 129 |
def generate_sql_query(system_prompt):
    """Use the Gemini API to generate an SQL query from *system_prompt*.

    Returns the generated SQL string when it looks like a valid query
    (starts with SELECT, or WITH for CTEs), otherwise a short warning
    string. Never raises: API failures are logged and reported as a
    warning string so the caller/UI stays responsive.
    """
    try:
        # Initialize the Gemini model (use a reliable text model).
        model = genai.GenerativeModel("gemini-2.5-pro")

        # Generate content from the system prompt.
        response = model.generate_content(system_prompt)

        # Debug: print the full Gemini response.
        print("🔍 Full API Response:", response)

        # Extract AI text response.
        result = response.text.strip()

        # LLMs frequently wrap SQL in markdown fences (```sql ... ```);
        # strip them so valid queries aren't rejected by the check below.
        if result.startswith("```"):
            result = result.strip("`").strip()
            if result.lower().startswith("sql"):
                # Drop the optional language tag after the opening fence.
                result = result[3:].strip()

        print(f"✅ AI Response: {result}")

        # Validate SQL query: accept plain SELECTs and WITH ... SELECT CTEs
        # (the old SELECT-only check rejected valid CTE queries).
        if result.lower().startswith(("select", "with")):
            return result
        else:
            print("⚠️ Gemini did not generate a valid SQL query.")
            return "⚠️ Invalid SQL query generated."

    except Exception as e:
        print(f"❌ Gemini API Error: {e}")
        return "⚠️ API failed. Check logs."
|
| 155 |
|
|
|
|
| 156 |
def response(user_query, dataset_folder):
|
| 157 |
"""Processes the user query and returns an SQL query."""
|
| 158 |
dataframes, metadata_list = load_dataset_metadata(dataset_folder)
|