Spaces:

kkhushisaid
/

Startup_Investment_Query_Generator

Sleeping

App Files Files Community

kkhushisaid committed on Mar 16, 2025

Commit

2116f61

verified ·

1 Parent(s): 3423ae7

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -0

app.py CHANGED Viewed

	@@ -0,0 +1,89 @@

+from dotenv import load_dotenv
+import os
+from sentence_transformers import SentenceTransformer
+import gradio as gr
+from sklearn.metrics.pairwise import cosine_similarity
+from groq import Groq
+import pandas as pd
+load_dotenv()
+groq_api_key = os.getenv("groq_api_key")
+def load_dataset_metadata(dataset_folder):
+    """Loads metadata from all CSV files in the dataset folder."""
+    dataframes = []
+    metadata_list = []
+    for file in os.listdir(dataset_folder):
+        if file.endswith(".csv"):
+            df = pd.read_csv(os.path.join(dataset_folder, file))
+            dataframes.append((file, df))
+            # Generate table metadata
+            columns = df.columns.tolist()
+            table_metadata = f"""
+            Table: {file.replace('.csv', '')}
+            Columns:
+            {', '.join(columns)}
+            """
+            metadata_list.append(table_metadata)
+    return dataframes, metadata_list
+def create_metadata_embeddings(metadata_list):
+    """Creates embeddings for all table metadata."""
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    embeddings = model.encode(metadata_list)
+    return embeddings, model
+def find_best_fit(embeddings, model, user_query, metadata_list):
+    """Finds the best matching table based on user query."""
+    query_embedding = model.encode([user_query])
+    similarities = cosine_similarity(query_embedding, embeddings)
+    best_match_index = similarities.argmax()
+    return metadata_list[best_match_index]
+def create_prompt(user_query, table_metadata):
+    """Generates a prompt for the AI model."""
+    system_prompt = """
+    You are a SQL query generator specialized in generating SQL queries for a single table at a time.
+    Your task is to accurately convert natural language queries into SQL statements based on the user's intent and the provided table metadata.
+    Rules:
+    - Assume all queries relate to a single table provided in the metadata. Ignore references to other tables.
+    - Ensure the generated query matches the table name, columns, and data types in the metadata.
+    - Capture filters, sorting, or aggregations as per user intent.
+    - Use standard SQL syntax.
+    Input:
+    User Query: {user_query}
+    Table Metadata: {table_metadata}
+    Output:
+    - Provide only the SQL query in a single line. No extra words.
+    """
+    return system_prompt
+def generate_sql_query(system_prompt):
+    """Uses Groq API to generate an SQL query."""
+    client = Groq(api_key=groq_api_key)
+    chat_completion = client.chat.completions.create(
+        messages=[{"role": "system", "content": system_prompt}],
+        model="llama3-70b-8192"
+    )
+    result = chat_completion.choices[0].message.content.strip()
+    return result if result.lower().startswith("select") else "Can't perform the task at the moment."
+def response(user_query, dataset_folder):
+    """Processes the user query and returns an SQL query."""
+    dataframes, metadata_list = load_dataset_metadata(dataset_folder)
+    embeddings, model = create_metadata_embeddings(metadata_list)
+    table_metadata = find_best_fit(embeddings, model, user_query, metadata_list)
+    system_prompt = create_prompt(user_query, table_metadata)
+    return generate_sql_query(system_prompt)
+# Example usage:
+dataset_folder = r"C:\\Users\\khuma\\startups"
+user_query = "Show me the top 10 startups with the highest funding."
+print(response(user_query, dataset_folder))