|
|
""" |
|
|
SQL-based Feedback Analysis Service |
|
|
|
|
|
This module implements a SQL-based approach to analyzing feedback: |
|
|
1. LLM analyzes user query |
|
|
2. LLM generates 1-5 SQL queries to answer the question |
|
|
3. Execute SQL queries on the feedback DataFrame |
|
|
4. LLM synthesizes a comprehensive answer from query + SQL queries + results |
|
|
5. (Optional) Generate visualizations of the results |
|
|
""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
import json |
|
|
import re |
|
|
from dataclasses import dataclass |
|
|
from typing import List, Dict, Any, Optional |
|
|
import pandas as pd |
|
|
import sqlite3 |
|
|
|
|
|
from .config import settings |
|
|
from .data_loader import load_feedback |
|
|
|
|
|
try: |
|
|
from openai import OpenAI |
|
|
except Exception: |
|
|
OpenAI = None |
|
|
|
|
|
try: |
|
|
import google.generativeai as genai |
|
|
except Exception: |
|
|
genai = None |
|
|
|
|
|
|
|
|
@dataclass
class SQLQueryResult:
    """
    Result of a single SQL query execution.

    One instance is produced per generated SQL statement, whether it
    succeeded or failed; a failure is recorded in ``error`` rather than
    raised, so one bad query does not abort the whole analysis.

    Attributes:
        query: The SQL query that was executed
        result: DataFrame containing the query results (empty if error occurred)
        error: Error message if query failed, None if successful
    """
    # The executed SQL text, kept for display and for the synthesis prompt.
    query: str
    # Query output; an empty DataFrame when execution failed.
    result: pd.DataFrame
    # None on success, otherwise the stringified exception.
    error: Optional[str] = None
|
|
|
|
|
|
|
|
@dataclass
class AnalysisResult:
    """
    Complete analysis result from processing a user query.

    Returned by SQLFeedbackService.analyze_query and consumed by the
    API/frontend layer.

    Attributes:
        user_query: The original question asked by the user
        sql_queries: List of SQL queries that were generated and executed
        query_results: Results from executing each SQL query
        summary: Final synthesized answer in natural language
        visualizations: Optional list of visualization specifications for frontend rendering
    """
    # The question exactly as the user typed it.
    user_query: str
    # SQL statements generated by the LLM (may be empty on failure).
    sql_queries: List[str]
    # One SQLQueryResult per statement in sql_queries.
    query_results: List[SQLQueryResult]
    # Natural-language answer (or a user-facing error message).
    summary: str
    # Chart specs for the frontend; None when nothing is plottable.
    visualizations: Optional[List[Dict[str, Any]]] = None
|
|
|
|
|
|
|
|
class SQLFeedbackService: |
|
|
""" |
|
|
Main service for SQL-based feedback analysis. |
|
|
|
|
|
This service implements a 4-stage pipeline: |
|
|
1. Generate SQL queries from natural language questions (using LLM) |
|
|
2. Execute SQL queries on feedback data (using SQLite in-memory) |
|
|
3. Synthesize comprehensive answers from query results (using LLM) |
|
|
4. Generate visualization specifications for results |
|
|
|
|
|
The service also includes automatic quality evaluation and improvement |
|
|
of generated answers to ensure high-quality responses. |
|
|
""" |
|
|
|
|
|
def __init__(self): |
|
|
""" |
|
|
Initialize the SQL feedback service. |
|
|
|
|
|
Loads feedback data from CSV into memory. If loading fails, |
|
|
the service will still initialize but will raise errors when |
|
|
trying to process queries. |
|
|
""" |
|
|
self.df: Optional[pd.DataFrame] = None |
|
|
self._load_data() |
|
|
|
|
|
def _load_data(self) -> None: |
|
|
""" |
|
|
Load feedback data from CSV file into memory. |
|
|
|
|
|
The data is loaded once at initialization and kept in memory |
|
|
for fast query execution. If the CSV file is missing or invalid, |
|
|
an error is logged but the service continues to initialize. |
|
|
|
|
|
Raises: |
|
|
FileNotFoundError: If CSV file doesn't exist (handled internally) |
|
|
ValueError: If CSV is missing required columns (handled internally) |
|
|
""" |
|
|
try: |
|
|
from .config import settings |
|
|
self.df = load_feedback() |
|
|
csv_path_used = settings.csv_path |
|
|
print(f"โ
Loaded {len(self.df)} feedback records from: {csv_path_used}", flush=True) |
|
|
except Exception as e: |
|
|
print(f"โ Error loading feedback data: {e}", flush=True) |
|
|
import traceback |
|
|
traceback.print_exc() |
|
|
self.df = None |
|
|
|
|
|
def _get_schema_info(self) -> str: |
|
|
""" |
|
|
Generate comprehensive schema information for the feedback table. |
|
|
|
|
|
This function analyzes the actual CSV file structure and provides |
|
|
detailed information about each field including business meaning, |
|
|
data types, examples, and usage patterns. |
|
|
|
|
|
Returns: |
|
|
A detailed formatted string describing the table schema with |
|
|
business context, examples, and statistics. |
|
|
""" |
|
|
if self.df is None: |
|
|
return "No data available" |
|
|
|
|
|
|
|
|
def get_sample_values(col_name, n=5): |
|
|
try: |
|
|
samples = self.df[col_name].dropna().head(n).tolist() |
|
|
return [str(s) for s in samples] |
|
|
except: |
|
|
return [] |
|
|
|
|
|
|
|
|
def get_unique_values(col_name, max_show=10): |
|
|
try: |
|
|
unique_vals = self.df[col_name].dropna().unique().tolist() |
|
|
if len(unique_vals) <= max_show: |
|
|
return unique_vals |
|
|
return unique_vals[:max_show] |
|
|
except: |
|
|
return [] |
|
|
|
|
|
all_columns_info = "" |
|
|
|
|
|
|
|
|
for col in self.df.columns: |
|
|
col_info = "" |
|
|
|
|
|
|
|
|
dtype = str(self.df[col].dtype) |
|
|
non_null_count = self.df[col].notna().sum() |
|
|
null_count = self.df[col].isna().sum() |
|
|
samples = get_sample_values(col, 3) |
|
|
|
|
|
|
|
|
if col == 'ID': |
|
|
col_info = f"โข {col} (UUID/ืืงืกื): ืืืื ืืืืืื ืืืืืื ืฉื ืื ืืฉืื\n" |
|
|
col_info += f" - ืืฉืืขืืช ืขืกืงืืช: ืืืื ืืืืืื ืืื ืืฉืื ืืืขืจืืช, ืืืคืฉืจ ืืขืงื, ืงืืฉืืจ ืืื ืืฉืืืื, ืืื ืืขืช ืืคืืืืืืช\n" |
|
|
col_info += f" - ืืืืืืืช: {', '.join(samples[:2])}\n" |
|
|
col_info += f" - ืฉืืืืฉ ืืฉืืืืชืืช: WHERE ID = '...', COUNT(DISTINCT ID), GROUP BY ID\n" |
|
|
|
|
|
|
|
|
elif col == 'ServiceName': |
|
|
unique_services = self.df[col].nunique() |
|
|
unique_samples = get_unique_values(col, 5) |
|
|
col_info = f"โข {col} (ืืงืกื): ืฉื ืืฉืืจืืช ืืืืืืืื ืืืืฉืืชื\n" |
|
|
col_info += f" - ืืฉืืขืืช ืขืกืงืืช: ืืืื ืืช ืืฉืืจืืช ืฉืขืืื ื ืืชื ืืืฉืื. ืืืคืฉืจ ื ืืชืื ืืคื ืฉืืจืืช, ืืฉืืืื ืืื ืฉืืจืืชืื, ืืืืื ืฉืืจืืชืื ืืขืืืชืืื ืื ืืฆืืืื ืื\n" |
|
|
col_info += f" - ืืฉ {unique_services} ืฉืืจืืชืื ืืืืืืืื ืืืขืจืืช\n" |
|
|
col_info += f" - ืืืืืืืช: {', '.join(unique_samples[:3])}\n" |
|
|
col_info += f" - ืฉืืืืฉ ืืฉืืืืชืืช: WHERE ServiceName = '...', GROUP BY ServiceName, COUNT(*) GROUP BY ServiceName\n" |
|
|
|
|
|
|
|
|
elif col == 'Level': |
|
|
level_dist = self.df[col].value_counts().sort_index().to_dict() |
|
|
avg_level = self.df[col].mean() |
|
|
col_info = f"โข {col} (ืืกืคืจ ืฉืื 1-5): ืืืจืื ืฉืืืขืืช ืจืฆืื ืืืฉืชืืฉ ืืืฉืืจืืช\n" |
|
|
col_info += f" - ืืฉืืขืืช ืขืกืงืืช: ืืื ืฉืืืขืืช ืจืฆืื. 1=ืืจืืข ืืืื, 2=ืืจืืข, 3=ืืื ืื ื, 4=ืืื, 5=ืืขืืื. ืืืคืฉืจ ืืืืืช ืฉืืืขืืช ืจืฆืื, ืืืืื ืืขืืืช, ืืืขืงื ืืืจ ืฉืืคืืจืื\n" |
|
|
col_info += f" - ืืืจืื ืืืืฆืข: {avg_level:.2f}\n" |
|
|
col_info += f" - ืืืืงื: {level_dist}\n" |
|
|
col_info += f" - ืฉืืืืฉ ืืฉืืืืชืืช: WHERE Level >= 4 (ืืฉืืืื ืืืืืืื), WHERE Level <= 2 (ืืฉืืืื ืฉืืืืืื), AVG(Level), GROUP BY Level\n" |
|
|
|
|
|
|
|
|
elif col == 'Text': |
|
|
sample_text = samples[0][:80] + "..." if samples and len(samples[0]) > 80 else (samples[0] if samples else "") |
|
|
avg_length = self.df[col].str.len().mean() if self.df[col].dtype == 'object' else 0 |
|
|
col_info = f"โข {col} (ืืงืกื ืืจืื): ืืชืืื ืืืืคืฉื ืฉื ืืืฉืื ืืืืฉืชืืฉ\n" |
|
|
col_info += f" - ืืฉืืขืืช ืขืกืงืืช: ืืืงืืจืืช, ืืฆืขืืช ืืฉืืคืืจ, ืชืืื ืืช, ืืืืืืช. ืืืคืฉืจ ื ืืชืื ืืืืืชื, ืืืืื ื ืืฉืืื ืืืืจืื, ืืงืืืช ืชืืื ืืช ืขืกืงืืืช\n" |
|
|
col_info += f" - ืืืจื ืืืืฆืข: {avg_length:.0f} ืชืืืื\n" |
|
|
col_info += f" - ืืืืื: '{sample_text}'\n" |
|
|
col_info += f" - ืฉืืืืฉ ืืฉืืืืชืืช: WHERE Text LIKE '%ืืืื%', WHERE Text LIKE '%ืืขืื%', LENGTH(Text), COUNT(*) WHERE Text IS NOT NULL\n" |
|
|
|
|
|
|
|
|
elif col == 'ReferenceNumber': |
|
|
if non_null_count > 0: |
|
|
ref_min = int(self.df[col].min()) |
|
|
ref_max = int(self.df[col].max()) |
|
|
col_info = f"โข {col} (ืืกืคืจ ืฉืื): ืืกืคืจ ืืคื ืื ืคื ืืื ืฉื ืืืฉืื\n" |
|
|
col_info += f" - ืืฉืืขืืช ืขืกืงืืช: ืืกืคืจ ืืคื ืื ืืืขืจืืช. ืืืคืฉืจ ืงืืฉืืจ ืืืกืืืื ืื ืืงืฉืืช ืงืฉืืจืืช, ืืขืงื ืืืจ ืชืืืืืื, ืื ืืืื ืืงืฉืืช\n" |
|
|
col_info += f" - ืืืื: {ref_min} - {ref_max}\n" |
|
|
col_info += f" - ืืืืืืืช: {', '.join([str(s) for s in samples[:2]])}\n" |
|
|
col_info += f" - NULL: {null_count} ืจืฉืืืืช ({null_count/len(self.df)*100:.1f}%)\n" |
|
|
col_info += f" - ืฉืืืืฉ ืืฉืืืืชืืช: WHERE ReferenceNumber = 6928, WHERE ReferenceNumber IS NOT NULL\n" |
|
|
|
|
|
|
|
|
elif col == 'RequestID': |
|
|
if non_null_count > 0: |
|
|
col_info = f"โข {col} (UUID/ืืงืกื): ืืืื ืืืืืื ืฉื ืืืงืฉื ืืืงืืจืืช ืฉืงืฉืืจื ืืืฉืื\n" |
|
|
col_info += f" - ืืฉืืขืืช ืขืกืงืืช: ืืืคืฉืจ ืงืืฉืืจ ืืื ืืงืฉืืช ืืืฉืืืื, ืืขืงื ืืืจ ืชืืืืืื, ืื ืืชืื ืืงืฉืจ ืืื ืืงืฉื ืืืฉืื\n" |
|
|
col_info += f" - ืืืืืืืช: {samples[0][:30]}...\n" |
|
|
col_info += f" - NULL: {null_count} ืจืฉืืืืช ({null_count/len(self.df)*100:.1f}%)\n" |
|
|
col_info += f" - ืฉืืืืฉ ืืฉืืืืชืืช: WHERE RequestID = '...', COUNT(DISTINCT RequestID)\n" |
|
|
|
|
|
|
|
|
elif col == 'ProcessID': |
|
|
col_info = f"โข {col} (UUID/ืืงืกื): ืืืื ืืืืืื ืฉื ืืชืืืื ืืขืกืงื ืฉืงืฉืืจ ืืืฉืื\n" |
|
|
col_info += f" - ืืฉืืขืืช ืขืกืงืืช: ืืืคืฉืจ ื ืืชืื ืืคื ืชืืืืืื, ืืืืื ืชืืืืืื ืืขืืืชืืื, ืืืขืงื ืืืจ ืืืฆืืขืื\n" |
|
|
col_info += f" - NULL: {null_count} ืจืฉืืืืช ({null_count/len(self.df)*100:.1f}%)\n" |
|
|
col_info += f" - ืฉืืืืฉ ืืฉืืืืชืืช: WHERE ProcessID = '...', COUNT(DISTINCT ProcessID)\n" |
|
|
|
|
|
|
|
|
elif col == 'Year': |
|
|
year_min = int(self.df[col].min()) |
|
|
year_max = int(self.df[col].max()) |
|
|
year_dist = self.df[col].value_counts().sort_index().to_dict() |
|
|
col_info = f"โข {col} (ืืกืคืจ ืฉืื): ืฉื ื ืฉืื ื ืืชื ืืืฉืื\n" |
|
|
col_info += f" - ืืฉืืขืืช ืขืกืงืืช: ืืืคืฉืจ ื ืืชืื ืืืืืช ืืืืจื ืฉื ืื, ืืฉืืืื ืืื ืฉื ืื, ืืืืื ืฉืืคืืจืื ืื ืืืืจืืจืืช, ืืชืื ืื ืืกืืจืืื\n" |
|
|
col_info += f" - ืืืื: {year_min} - {year_max}\n" |
|
|
col_info += f" - ืืืืงื: {year_dist}\n" |
|
|
col_info += f" - ืฉืืืืฉ ืืฉืืืืชืืช: WHERE Year = 2020, GROUP BY Year, SELECT Year, COUNT(*) GROUP BY Year\n" |
|
|
|
|
|
|
|
|
elif col == 'Month': |
|
|
month_min = int(self.df[col].min()) |
|
|
month_max = int(self.df[col].max()) |
|
|
month_names = {1: 'ืื ืืืจ', 2: 'ืคืืจืืืจ', 3: 'ืืจืฅ', 4: 'ืืคืจืื', 5: 'ืืื', 6: 'ืืื ื', |
|
|
7: 'ืืืื', 8: 'ืืืืืกื', 9: 'ืกืคืืืืจ', 10: 'ืืืงืืืืจ', 11: 'ื ืืืืืจ', 12: 'ืืฆืืืจ'} |
|
|
col_info = f"โข {col} (ืืกืคืจ ืฉืื 1-12): ืืืืฉ ืืฉื ื ืฉืื ื ืืชื ืืืฉืื\n" |
|
|
col_info += f" - ืืฉืืขืืช ืขืกืงืืช: ืืืคืฉืจ ื ืืชืื ืขืื ืชื, ืืืืื ืืืืฉืื ืืขืืืชืืื ืื ืืฆืืืื ืื, ืชืื ืื ืืฉืืืื ืืคื ืขืื ืืช\n" |
|
|
col_info += f" - ืืืื: {month_min} - {month_max} ({month_names.get(month_min, '')} - {month_names.get(month_max, '')})\n" |
|
|
col_info += f" - ืืืืืืืช: {', '.join([str(s) for s in samples[:3]])}\n" |
|
|
col_info += f" - ืฉืืืืฉ ืืฉืืืืชืืช: WHERE Month = 1, GROUP BY Month, SELECT Month, COUNT(*) GROUP BY Month ORDER BY Month\n" |
|
|
|
|
|
|
|
|
elif col == 'DayInMonth': |
|
|
day_min = int(self.df[col].min()) |
|
|
day_max = int(self.df[col].max()) |
|
|
col_info = f"โข {col} (ืืกืคืจ ืฉืื 1-31): ืืื ืืืืืฉ ืฉืื ื ืืชื ืืืฉืื\n" |
|
|
col_info += f" - ืืฉืืขืืช ืขืกืงืืช: ืืืคืฉืจ ื ืืชืื ืืคื ืืืื ืืืืืฉ, ืืืืื ืืืื ืืขืืืชืืื (ืืืฉื ืกืืฃ ืืืืฉ), ืื ืืชืื ืืคืืกืื ืืืืืื\n" |
|
|
col_info += f" - ืืืื: {day_min} - {day_max}\n" |
|
|
col_info += f" - ืืืืืืืช: {', '.join([str(s) for s in samples[:3]])}\n" |
|
|
col_info += f" - ืฉืืืืฉ ืืฉืืืืชืืช: WHERE DayInMonth = 1, GROUP BY DayInMonth, SELECT DayInMonth, COUNT(*) GROUP BY DayInMonth\n" |
|
|
|
|
|
|
|
|
elif col == 'DayOfWeek': |
|
|
unique_days = get_unique_values(col, 10) |
|
|
day_names_he = {'Monday': 'ืฉื ื', 'Tuesday': 'ืฉืืืฉื', 'Wednesday': 'ืจืืืขื', 'Thursday': 'ืืืืฉื', |
|
|
'Friday': 'ืฉืืฉื', 'Saturday': 'ืฉืืช', 'Sunday': 'ืจืืฉืื'} |
|
|
col_info = f"โข {col} (ืืงืกื): ืืื ืืฉืืืข ืฉืื ื ืืชื ืืืฉืื (ืืื ืืืืช)\n" |
|
|
col_info += f" - ืืฉืืขืืช ืขืกืงืืช: ืืืคืฉืจ ื ืืชืื ืืคื ืืื ืืฉืืืข, ืืืืื ืืืื ืืขืืืชืืื, ืชืื ืื ืืื ืืื, ืืืืืื ืืคืืกืื ืฉืืืขืืื\n" |
|
|
col_info += f" - ืขืจืืื ืืคืฉืจืืื: Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday\n" |
|
|
col_info += f" - ืืืืืืืช: {', '.join(unique_days[:3])}\n" |
|
|
col_info += f" - ืฉืืืืฉ ืืฉืืืืชืืช: WHERE DayOfWeek = 'Monday', GROUP BY DayOfWeek, SELECT DayOfWeek, COUNT(*) GROUP BY DayOfWeek\n" |
|
|
|
|
|
|
|
|
elif col == 'Hour': |
|
|
hour_min = int(self.df[col].min()) |
|
|
hour_max = int(self.df[col].max()) |
|
|
col_info = f"โข {col} (ืืกืคืจ ืฉืื 0-23): ืฉืขื ืืืื ืฉืื ื ืืชื ืืืฉืื\n" |
|
|
col_info += f" - ืืฉืืขืืช ืขืกืงืืช: ืืืคืฉืจ ื ืืชืื ืืคื ืฉืขืืช ืืืื, ืืืืื ืฉืขืืช ืฉืื, ืชืื ืื ืืืื ืืช ืฉืืจืืช, ืืืืืื ืืคืืกืื ืืืืืื\n" |
|
|
col_info += f" - ืืืื: {hour_min} - {hour_max} (0=ืืฆืืช, 12=ืฆืืจืืื, 23=23:00)\n" |
|
|
col_info += f" - ืืืืืืืช: {', '.join([str(s) for s in samples[:3]])}\n" |
|
|
col_info += f" - ืฉืืืืฉ ืืฉืืืืชืืช: WHERE Hour >= 9 AND Hour <= 17 (ืฉืขืืช ืขืืืื), GROUP BY Hour, SELECT Hour, COUNT(*) GROUP BY Hour ORDER BY Hour\n" |
|
|
|
|
|
|
|
|
elif col == 'DayOrNight': |
|
|
unique_values = get_unique_values(col, 5) |
|
|
col_info = f"โข {col} (ืืงืกื): ืืื ืืืฉืื ื ืืชื ืืฉืขืืช ืืืื ืื ืืืืื\n" |
|
|
col_info += f" - ืืฉืืขืืช ืขืกืงืืช: ืืืคืฉืจ ื ืืชืื ืืคื ืฉืขืืช ืคืขืืืืช, ืืืืื ืืืืืื ืืื ืืื ืืืืื, ืชืื ืื ืืืื ืืช ืฉืืจืืช\n" |
|
|
col_info += f" - ืขืจืืื ืืคืฉืจืืื: 'ืืื' ืื 'ืืืื' (ืืื=6:00-18:00, ืืืื=18:00-6:00)\n" |
|
|
col_info += f" - ืืืืืืืช: {', '.join(unique_values)}\n" |
|
|
col_info += f" - ืฉืืืืฉ ืืฉืืืืชืืช: WHERE DayOrNight = 'ืืื', GROUP BY DayOrNight, SELECT DayOrNight, COUNT(*) GROUP BY DayOrNight\n" |
|
|
|
|
|
|
|
|
else: |
|
|
if dtype in ['int64', 'float64']: |
|
|
val_min = self.df[col].min() |
|
|
val_max = self.df[col].max() |
|
|
col_info = f"โข {col} ({dtype}): ืืกืคืจ. ืืืื: {val_min} - {val_max}\n" |
|
|
else: |
|
|
unique_count = self.df[col].nunique() |
|
|
col_info = f"โข {col} ({dtype}): ืืงืกื. {unique_count} ืขืจืืื ืืืืืืืื\n" |
|
|
col_info += f" - ืืืืืืืช: {', '.join(samples[:2])}\n" |
|
|
|
|
|
all_columns_info += col_info + "\n" |
|
|
|
|
|
|
|
|
total_records = len(self.df) |
|
|
unique_services = self.df['ServiceName'].nunique() if 'ServiceName' in self.df.columns else 0 |
|
|
avg_level = self.df['Level'].mean() if 'Level' in self.df.columns else 0 |
|
|
|
|
|
schema_info = f"""ืฉื ืืืืื: Feedback_transformed (ืขื ืืืช ืืืืื F - ืืืื!) |
|
|
|
|
|
ืกืืืืกืืืงืืช ืืืืืืช: |
|
|
- ืกื ืืื ืืฉืืืื: {total_records} |
|
|
- ืืกืคืจ ืฉืืจืืชืื ืืืืืืืื: {unique_services} |
|
|
- ืืืจืื ืืืืฆืข: {avg_level:.2f} |
|
|
|
|
|
ืฉืืืช ืืืืื: |
|
|
{all_columns_info} |
|
|
|
|
|
โ ๏ธ ืืฉืื: ืื ืฉืืืืชื ืืืืืช ืืืชืืื ื-SELECT ืืืืฉืชืืฉ ื-FROM Feedback_transformed! |
|
|
""" |
|
|
return schema_info |
|
|
|
|
|
    def analyze_query(self, query: str) -> AnalysisResult:
        """
        Main analysis pipeline:
        1. Analyze user query
        2. Generate SQL queries
        3. Execute SQL queries
        4. Synthesize answer

        Args:
            query: The user's natural-language question.

        Returns:
            AnalysisResult carrying the generated SQL, per-query results,
            the synthesized summary and optional visualization specs.
            On non-fatal failures (missing API keys, no queries generated)
            the result has empty queries/results and an explanatory summary.

        Raises:
            ValueError: If no feedback data is loaded.
        """
        print(f"๐ Analyzing query: {query}", flush=True)

        # Hard failure: without the CSV loaded there is nothing to analyze.
        if self.df is None:
            error_msg = "No feedback data available. Please ensure feedback_transformed_2.csv exists in 0_preprocessing/ directory."
            print(f"โ {error_msg}", flush=True)
            raise ValueError(error_msg)

        print(f"โ Data loaded: {len(self.df)} rows", flush=True)

        # Soft failure: missing API keys yields an explanatory result rather
        # than an exception, so the frontend can display the message.
        if not settings.gemini_api_key and not settings.openai_api_key:
            error_msg = "โ No API keys configured! Please set GEMINI_API_KEY or OPENAI_API_KEY in Repository secrets."
            print(error_msg, flush=True)
            return AnalysisResult(
                user_query=query,
                sql_queries=[],
                query_results=[],
                summary=error_msg,
                visualizations=None
            )

        print(f"โ API keys available: Gemini={bool(settings.gemini_api_key)}, OpenAI={bool(settings.openai_api_key)}", flush=True)

        # Stage 1: have the LLM turn the question into 1-5 SQL queries.
        try:
            print("๐ Generating SQL queries...", flush=True)
            sql_queries = self._generate_sql_queries(query)
            print(f"โ Generated {len(sql_queries)} SQL queries", flush=True)
            if len(sql_queries) == 0:
                # LLM produced nothing parseable; summary is user-facing Hebrew.
                error_msg = "ืื ื ืืฆืจื ืฉืืืืชืืช SQL. ืืืชืื ืฉืืฉืืื ืื ืืจืืจื ืื ืฉืืฉ ืืขืื ืขื ื-API. ื ืกื ืืฉืืื ืฉืืื ืืืจืช ืื ืืืืง ืืช ื-API keys."
                print(f"โ {error_msg}", flush=True)
                return AnalysisResult(
                    user_query=query,
                    sql_queries=[],
                    query_results=[],
                    summary=error_msg,
                    visualizations=None
                )
        except ValueError as e:
            # Raised by the gibberish check inside _generate_sql_queries;
            # the exception message is already user-facing Hebrew.
            print(f"โ Query validation error: {e}", flush=True)
            return AnalysisResult(
                user_query=query,
                sql_queries=[],
                query_results=[],
                summary=str(e),
                visualizations=None
            )
        except Exception as e:
            # Any other generation failure (network, API, parsing).
            error_msg = f"ืฉืืืื ืืืฆืืจืช ืฉืืืืชืืช SQL: {str(e)}. ืืืืง ืืช ื-API keys ืืืืืืืจ ืืืื ืืจื ื."
            print(f"โ {error_msg}", flush=True)
            import traceback
            traceback.print_exc()
            return AnalysisResult(
                user_query=query,
                sql_queries=[],
                query_results=[],
                summary=error_msg,
                visualizations=None
            )

        # Stage 2: run the generated SQL on the in-memory SQLite copy.
        print("๐ Executing SQL queries...", flush=True)
        query_results = self._execute_sql_queries(sql_queries)
        successful_results = [r for r in query_results if not r.error and len(r.result) > 0]
        print(f"โ Executed {len(query_results)} queries, {len(successful_results)} successful", flush=True)

        # Stage 3: have the LLM write the final natural-language answer.
        print("๐ Synthesizing answer...", flush=True)
        summary = self._synthesize_answer(query, sql_queries, query_results)

        # Stage 4: chart specs for the frontend.
        # NOTE(review): _generate_visualizations is defined elsewhere in this
        # class (outside this view).
        visualizations = self._generate_visualizations(query_results)

        return AnalysisResult(
            user_query=query,
            sql_queries=sql_queries,
            query_results=query_results,
            summary=summary,
            visualizations=visualizations
        )
|
|
|
|
|
def _is_gibberish_query(self, query: str) -> bool: |
|
|
""" |
|
|
Check if the query is gibberish or unintelligible. |
|
|
|
|
|
Returns True if the query appears to be gibberish, False otherwise. |
|
|
""" |
|
|
|
|
|
query_clean = query.strip() |
|
|
|
|
|
|
|
|
if len(query_clean) < 3: |
|
|
return True |
|
|
|
|
|
|
|
|
if not any(c.isalpha() for c in query_clean): |
|
|
return True |
|
|
|
|
|
|
|
|
alpha_count = sum(1 for c in query_clean if c.isalpha()) |
|
|
if alpha_count < len(query_clean) * 0.3: |
|
|
return True |
|
|
|
|
|
return False |
|
|
|
|
|
    def _generate_sql_queries(self, query: str) -> List[str]:
        """
        Use LLM to generate 1-5 SQL queries that will help answer the user's question.

        Tries Gemini first, then falls back to OpenAI; returns an empty list
        when neither provider is available or both fail.

        Args:
            query: The user's natural-language question.

        Returns:
            List of SQL query strings (possibly empty).

        Raises:
            ValueError: If the query fails the gibberish heuristic; the
                message is user-facing Hebrew.
        """
        # Reject obviously unintelligible input before spending an API call.
        if self._is_gibberish_query(query):
            raise ValueError("ืืฉืืื ืื ืืจืืจื. ืื ื ื ืกื ืืช ืืฉืืื ืืฆืืจื ืืืชืจ ืืจืืจื ืืืคืืจืืช ืืขืืจืืช.")

        schema_info = self._get_schema_info()

        # Prompt is Hebrew (user/LLM-facing); it embeds the live schema and
        # instructs the model to answer with a bare JSON {"queries": [...]}.
        prompt = f"""ืฆืืจ ืฉืืืืชืืช SQL ืืฉืืื: {query}

{schema_info}

ืืืืื ืืฉืืืื:
1. ืฉื ืืืืื: Feedback_transformed (ืขื ืืืช ืืืืื F) - ืืืื ืืื ืฉืืืืชื!
2. ืฉืืืืชืืช ืืื: ืืฉืชืืฉ ืืฉืืืช ืืืืืฉืืื (Year, Month, DayInMonth, DayOfWeek, Hour, DayOrNight) - ืื ืืืจ ืืืื ืื!
3. Level: 1-5 (1=ืืจืืข ืืืื, 5=ืืขืืื) - ืืฉืชืืฉ ื-Level ืื ืืชืื ืฉืืืขืืช ืจืฆืื
4. Text: ืืืคืืฉ ืขื LIKE '%ืืืื%' - ืื ืืชืื ืชืืื ืืืฉืืืื
5. ServiceName: ืฉื ืืฉืืจืืช - ืื ืืชืื ืืคื ืฉืืจืืชืื

ืืืืืืืช ืืฉืืืืชืืช:
- SELECT ServiceName, AVG(Level) as avg_rating, COUNT(*) as count FROM Feedback_transformed GROUP BY ServiceName ORDER BY avg_rating DESC
- SELECT COUNT(*) FROM Feedback_transformed WHERE Level >= 4
- SELECT Year, COUNT(*) as count FROM Feedback_transformed GROUP BY Year ORDER BY Year
- SELECT DayOfWeek, COUNT(*) as count FROM Feedback_transformed GROUP BY DayOfWeek
- SELECT ServiceName, Level, COUNT(*) as count FROM Feedback_transformed GROUP BY ServiceName, Level

ืคืืจืื ืืชืฉืืื - JSON ืืืื:
{{
  "queries": [
    "SELECT ... FROM Feedback_transformed ...",
    "SELECT ... FROM Feedback_transformed ..."
  ]
}}

ืชื ืจืง ืืช ื-JSON, ืืื ืืงืกื ื ืืกืฃ."""

        # Provider 1: Gemini (preferred when a key is configured).
        if settings.gemini_api_key and genai is not None:
            try:
                print("๐ Using Gemini API for SQL generation...", flush=True)
                genai.configure(api_key=settings.gemini_api_key)
                model = genai.GenerativeModel("gemini-2.0-flash")
                response = model.generate_content(prompt)
                text = getattr(response, "text", None)
                if text:
                    print(f"โ Received response from Gemini: {text[:200]}...", flush=True)
                    queries = self._parse_sql_queries(text)
                    print(f"โ Parsed {len(queries)} SQL queries from Gemini response", flush=True)
                    return queries
                else:
                    print("โ Gemini returned empty response", flush=True)
            except Exception as e:
                # Fall through to OpenAI on any Gemini failure.
                print(f"โ Gemini error in SQL generation: {e}", flush=True)
                import traceback
                traceback.print_exc()

        # Provider 2: OpenAI fallback.
        if settings.openai_api_key and OpenAI is not None:
            try:
                print("๐ Using OpenAI API for SQL generation...", flush=True)
                client = OpenAI(api_key=settings.openai_api_key)
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.3,
                )
                text = response.choices[0].message.content
                if text:
                    print(f"โ Received response from OpenAI: {text[:200]}...", flush=True)
                    queries = self._parse_sql_queries(text)
                    print(f"โ Parsed {len(queries)} SQL queries from OpenAI response", flush=True)
                    return queries
                else:
                    print("โ OpenAI returned empty response", flush=True)
            except Exception as e:
                print(f"โ OpenAI error in SQL generation: {e}", flush=True)
                import traceback
                traceback.print_exc()

        # Both providers unavailable or failed; caller treats [] as an error.
        print("โ No API available or all APIs failed. Returning empty query list.", flush=True)
        return []
|
|
|
|
|
def _parse_sql_queries(self, text: str) -> List[str]: |
|
|
""" |
|
|
Parse SQL queries from LLM response text. |
|
|
|
|
|
The LLM is instructed to return JSON, but sometimes it may include |
|
|
markdown formatting or return SQL directly. This function handles |
|
|
multiple formats for robustness. |
|
|
|
|
|
Args: |
|
|
text: Raw text response from LLM (may be JSON, markdown, or plain SQL) |
|
|
|
|
|
Returns: |
|
|
List of SQL query strings, cleaned and validated. |
|
|
Empty list if parsing fails completely. |
|
|
|
|
|
Strategy: |
|
|
1. First, try to parse as JSON (expected format) |
|
|
2. If that fails, try to extract SQL queries using regex |
|
|
3. Return empty list if both methods fail |
|
|
""" |
|
|
|
|
|
try: |
|
|
|
|
|
text = re.sub(r'```json\s*', '', text) |
|
|
text = re.sub(r'```\s*', '', text) |
|
|
text = text.strip() |
|
|
|
|
|
|
|
|
data = json.loads(text) |
|
|
if isinstance(data, dict) and "queries" in data: |
|
|
queries = data["queries"] |
|
|
if isinstance(queries, list): |
|
|
|
|
|
return [q for q in queries if isinstance(q, str) and q.strip()] |
|
|
except Exception: |
|
|
|
|
|
pass |
|
|
|
|
|
|
|
|
|
|
|
sql_pattern = r'SELECT\s+.*?(?=\n\n|\nSELECT|$)' |
|
|
matches = re.findall(sql_pattern, text, re.IGNORECASE | re.DOTALL) |
|
|
if matches: |
|
|
return [m.strip() for m in matches] |
|
|
|
|
|
|
|
|
|
|
|
return [] |
|
|
|
|
|
def _execute_sql_queries(self, sql_queries: List[str]) -> List[SQLQueryResult]: |
|
|
""" |
|
|
Execute SQL queries on the feedback DataFrame using SQLite in-memory database. |
|
|
|
|
|
This method creates a temporary SQLite database in memory, loads the |
|
|
feedback DataFrame into it, and executes each SQL query. Errors are |
|
|
caught per-query so one failing query doesn't stop the others. |
|
|
|
|
|
Args: |
|
|
sql_queries: List of SQL query strings to execute |
|
|
|
|
|
Returns: |
|
|
List of SQLQueryResult objects, one per query. Each result contains |
|
|
either the query results (DataFrame) or an error message. |
|
|
|
|
|
Implementation details: |
|
|
- Uses SQLite in-memory database (':memory:') for fast execution |
|
|
- DataFrame is loaded into table named 'Feedback_transformed' |
|
|
- Each query is executed independently (errors don't cascade) |
|
|
- Connection is always closed in finally block for safety |
|
|
""" |
|
|
if self.df is None: |
|
|
return [] |
|
|
|
|
|
results = [] |
|
|
|
|
|
|
|
|
|
|
|
conn = sqlite3.connect(':memory:') |
|
|
try: |
|
|
|
|
|
|
|
|
self.df.to_sql('Feedback_transformed', conn, index=False, if_exists='replace') |
|
|
|
|
|
|
|
|
|
|
|
for query in sql_queries: |
|
|
try: |
|
|
|
|
|
result_df = pd.read_sql_query(query, conn) |
|
|
results.append(SQLQueryResult( |
|
|
query=query, |
|
|
result=result_df, |
|
|
error=None |
|
|
)) |
|
|
except Exception as e: |
|
|
|
|
|
results.append(SQLQueryResult( |
|
|
query=query, |
|
|
result=pd.DataFrame(), |
|
|
error=str(e) |
|
|
)) |
|
|
finally: |
|
|
|
|
|
conn.close() |
|
|
|
|
|
return results |
|
|
|
|
|
    def _evaluate_answer_quality(self, query: str, answer: str, sql_queries: Optional[List[str]] = None, query_results: Optional[List] = None) -> tuple[float, str]:
        """
        Evaluate the quality of an answer using an LLM reviewer.

        Tries Gemini first, then OpenAI; when neither is available (or both
        fail to return parseable JSON), a permissive default score of 85.0
        is returned so the pipeline is not blocked.

        Args:
            query: The user's original question
            answer: The synthesized answer to evaluate
            sql_queries: List of SQL queries that were executed (optional, for context)
            query_results: Results from executing those queries (optional, for context)

        Returns:
            tuple: (score 0-100, feedback/reasoning)
        """
        # Optional context block summarizing each executed query and whether
        # it errored or how many rows it returned (Hebrew, LLM-facing).
        context_text = ""
        if sql_queries and query_results:
            context_text = "\n\nืืฉืืืืชืืช ืฉืืืฆืขื:\n"
            for i, (q, r) in enumerate(zip(sql_queries, query_results), 1):
                context_text += f"{i}. {q}\n"
                if hasattr(r, 'error') and r.error:
                    context_text += f"   ืฉืืืื: {r.error}\n"
                elif hasattr(r, 'result'):
                    context_text += f"   ืชืืฆืืืช: {len(r.result) if hasattr(r.result, '__len__') else 'N/A'} ืฉืืจืืช\n"

        # Rubric prompt (Hebrew): five weighted criteria summing to 100,
        # answered as bare JSON {"score": ..., "reasoning": ...}.
        evaluation_prompt = f"""ืืชื ืืืืง ืืืืืช ืชืฉืืืืช ืืงืฆืืขื. ืืขืจื ืืช ืืชืฉืืื ืืืื:

ืฉืืืช ืืืฉืชืืฉ ืืืงืืจืืช: {query}
{context_text}

ืืชืฉืืื ืฉื ืืชื ื:
{answer}

โ ๏ธ ืืขืจื ืืช ืืชืฉืืื ืืคื ืืงืจืืืจืืื ืื ืืืืื (0-100):
1. ืืื ืืชืฉืืื ืขืื ื ืืฉืืจืืช ืขื ืืฉืืื ืืืงืืจืืช? (0-30 ื ืงืืืืช)
   - ืืื ืืชืฉืืื ืืชืืืืกืช ืืฉืืจืืช ืืฉืืื: {query}?
   - ืื ืืฉืืื ืืืงืฉืช ืกืืืื/ืืืืงื ืืคื ืฉืืจืืชืื (ServiceName) - ืืื ืืชืฉืืื ืืืืืช ื ืืชืื ื ืคืจื ืืื ืฉืืจืืช?
   - ืื ืืฉืืื ืืืงืฉืช ืกืืืื/ืืืืงื ืืคื ืืืจืืืื (Level) - ืืื ืืชืฉืืื ืืืืืช ื ืืชืื ื ืคืจื ืืื ืืืจืื?
   - ืื ืืฉืืื ืืืงืฉืช ืกืืืื/ืืืืงื ืืคื ืชืืจืืืื - ืืื ืืชืฉืืื ืืืืืช ื ืืชืื ื ืคืจื ืืคื ืชืงืืคืืช?
   - ืืื ืืชืฉืืื ืืื ืชืฉืืื ืืืืืืืช ืืคืืจืืช ืืื ืจืง ืืืืขื ืฉืฉืืืืชืืช ืืืฆืขื?

2. ืืื ืืชืฉืืื ืืืืกืกืช ืขื ืื ืชืื ืื ืืืฉืืืืชืืช? (0-25 ื ืงืืืืช)
   - ืืื ืืชืฉืืื ืืฉืชืืฉืช ืื ืชืื ืื ืืืฉืืืืชืืช?
   - ืืื ืืชืฉืืื ืืกืืืจื ืืื ืืฉืืืืชืืช ืขืืืจืืช ืืขื ืืช ืขื ืืฉืืื?
   - ืืื ืืชืฉืืื ืืืืืช ืืกืคืจืื ืืืืืงืื ืืืชืืฆืืืช?

3. ืืื ืืชืฉืืื ืืคืืจืืช ืืืงืืคื? (0-20 ื ืงืืืืช)
   - ืืื ืืชืฉืืื ืืจืืื ืืืคืืจืืช (ืืคืืืช 400-600 ืืืืื)?
   - ืืื ืืชืฉืืื ืืืืืช ื ืืชืื ืืขืืืง ืืื ืจืง ืจืฉืืืช ื ืชืื ืื?

4. ืืื ืืชืฉืืื ืืจืืจื ืืงืืืจื ืืืช? (0-15 ื ืงืืืืช)
   - ืืื ืืชืฉืืื ืืชืืื ืืฉืคื ืืจืืจื ืืืืื ืช?
   - ืืื ืืชืฉืืื ืืืืจืื ืช ืืืื (ืื ืืืืื ืฉื ืืืืื)?

5. ืืื ืืชืฉืืื ืืืืืช ืชืืื ืืช ืขืกืงืืืช? (0-10 ื ืงืืืืช)
   - ืืื ืืชืฉืืื ืืืืืช ืชืืื ืืช ืขื ืชืืืืืื ืืืืืืืืื?
   - ืืื ืืชืฉืืื ืืืืืช ืืืืฆืืช ืืขืฉืืืช?

ืชื ืฆืืื ืืืื (0-100) ืืืกืืจ ืงืฆืจ (2-3 ืืฉืคืืื) ืืื ืืฆืืื ืืื.

ืคืืจืื ืืชืฉืืื - JSON ืืืื:
{{
    "score": <ืืกืคืจ 0-100>,
    "reasoning": "<ืืกืืจ ืงืฆืจ>"
}}

ืชื ืจืง ืืช ื-JSON, ืืื ืืงืกื ื ืืกืฃ."""

        # Reviewer 1: Gemini.
        if settings.gemini_api_key and genai is not None:
            try:
                genai.configure(api_key=settings.gemini_api_key)
                model = genai.GenerativeModel("gemini-2.0-flash")
                response = model.generate_content(evaluation_prompt)
                text = getattr(response, "text", None)
                if text:
                    # NOTE(review): r'\{[^}]+\}' only matches flat JSON (no
                    # nested braces) -- adequate for the requested format.
                    json_match = re.search(r'\{[^}]+\}', text, re.DOTALL)
                    if json_match:
                        try:
                            data = json.loads(json_match.group())
                            score = float(data.get('score', 0))
                            reasoning = data.get('reasoning', '')
                            return score, reasoning
                        except (json.JSONDecodeError, ValueError, KeyError):
                            pass
            except Exception as e:
                print(f"Gemini error in evaluation: {e}", flush=True)

        # Reviewer 2: OpenAI fallback.
        if settings.openai_api_key and OpenAI is not None:
            try:
                client = OpenAI(api_key=settings.openai_api_key)
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": evaluation_prompt}],
                    temperature=0.3,
                )
                text = response.choices[0].message.content
                if text:
                    # Same flat-JSON extraction as the Gemini path.
                    json_match = re.search(r'\{[^}]+\}', text, re.DOTALL)
                    if json_match:
                        try:
                            data = json.loads(json_match.group())
                            score = float(data.get('score', 0))
                            reasoning = data.get('reasoning', '')
                            return score, reasoning
                        except (json.JSONDecodeError, ValueError, KeyError):
                            pass
            except Exception as e:
                print(f"OpenAI error in evaluation: {e}", flush=True)

        # Permissive default when no reviewer is available: pass the answer.
        return 85.0, "ืื ื ืืชื ืืืขืจืื - ืืืืืจ ืฆืืื ืืจืืจืช ืืืื"
|
|
|
|
|
def _synthesize_answer(self, query: str, sql_queries: List[str], |
|
|
query_results: List[SQLQueryResult], max_retries: int = 2) -> str: |
|
|
""" |
|
|
Use LLM to synthesize a comprehensive answer from: |
|
|
- User query |
|
|
- SQL queries that were executed |
|
|
- Results of those queries |
|
|
|
|
|
Includes quality evaluation and automatic improvement if score < 80. |
|
|
|
|
|
Args: |
|
|
query: The user's original question |
|
|
sql_queries: List of SQL queries that were executed |
|
|
query_results: Results from executing those queries |
|
|
max_retries: Maximum number of retry attempts if quality is low |
|
|
|
|
|
Returns: |
|
|
Final synthesized answer |
|
|
""" |
|
|
|
|
|
results_text = "" |
|
|
for i, qr in enumerate(query_results, 1): |
|
|
results_text += f"\nืฉืืืืชื {i}:\n{qr.query}\n\n" |
|
|
if qr.error: |
|
|
results_text += f"ืฉืืืื: {qr.error}\n\n" |
|
|
else: |
|
|
|
|
|
if len(qr.result) == 0: |
|
|
results_text += "ืชืืฆืื: ืืื ืชืืฆืืืช\n\n" |
|
|
else: |
|
|
results_text += f"ืชืืฆืื ({len(qr.result)} ืฉืืจืืช):\n" |
|
|
results_text += qr.result.to_string(index=False) |
|
|
results_text += "\n\n" |
|
|
|
|
|
prompt = f"""ืืชื ืื ืืืกื ื ืชืื ืื. ืืืฉืชืืฉ ืฉืื ืฉืืื ืขื ืืฉืืื ืืฉืชืืฉืื. |
|
|
|
|
|
ืฉืืืช ืืืฉืชืืฉ: {query} |
|
|
|
|
|
ืชืืฆืืืช ืืฉืืืืชืืช: |
|
|
{results_text} |
|
|
|
|
|
ืืชืื ืชืฉืืื ืงืฆืจื, ืืืืืงืช ืืืกืืืจืช ืฉืืืืกืกืช ืืฉืืจืืช ืขื ืืชืืฆืืืช. |
|
|
|
|
|
โ ๏ธ ืืืืื: |
|
|
1. ืขื ื ืืฉืืจืืช ืขื ืืฉืืื - ืื ืืืชืจ, ืื ืคืืืช |
|
|
2. ืืฉืชืืฉ ืืืกืคืจืื ืืืืืืงืื ืืืชืืฆืืืช |
|
|
3. ืคืกืงืืืช ืงืฆืจืืช (2-3 ืืฉืคืืื ืื ืืืช) |
|
|
4. ืื ืืฉ ืกืืืื - ืจืฉืื ืื ืงืืืืจืื ืขื ืืืกืคืจ ืฉืื |
|
|
5. ืขืืจืืช ืคืฉืืื ืืืจืืจื |
|
|
|
|
|
ืืื ื: |
|
|
- ืคืกืงื ืจืืฉืื ื: ืชืฉืืื ืืฉืืจื ืืฉืืื ืขื ืืืกืคืจืื ืืขืืงืจืืื |
|
|
- ืคืกืงืืืช ื ืืกืคืืช: ืคืืจืื ืืคื ืืฆืืจื (ืื ืืฉ ืกืืืื - ืื ืงืืืืจืื ืื ืคืจื) |
|
|
- ืืฉืคื ืกืืืื: ืืืกืงื ื ืืขืืงืจืืช |
|
|
|
|
|
โ ๏ธ ืืฉืื: ืชืฉืืื ืงืฆืจื ืืืืืืงืช. ืื ืคืืืืกืืคืื, ืื ืืกืืจืื ืืจืืืื - ืจืง ืื ืชืื ืื ืืืืกืงื ืืช. |
|
|
|
|
|
ืื ืืฉ ืฉืืืืืช ืืฉืืืืชืืช, ืฆืืื ืืืช ืืงืฆืจื.""" |
|
|
|
|
|
|
|
|
if settings.gemini_api_key and genai is not None: |
|
|
try: |
|
|
genai.configure(api_key=settings.gemini_api_key) |
|
|
model = genai.GenerativeModel("gemini-2.0-flash") |
|
|
generation_config = { |
|
|
"temperature": 0.8, |
|
|
"top_p": 0.95, |
|
|
"top_k": 40, |
|
|
"max_output_tokens": 4000, |
|
|
} |
|
|
response = model.generate_content(prompt, generation_config=generation_config) |
|
|
text = getattr(response, "text", None) |
|
|
if text and text.strip(): |
|
|
answer = text.strip() |
|
|
|
|
|
|
|
|
score, reasoning = self._evaluate_answer_quality(query, answer, sql_queries, query_results) |
|
|
print(f"Answer quality score: {score:.1f}/100 - {reasoning}", flush=True) |
|
|
|
|
|
|
|
|
if score < 80 and max_retries > 0: |
|
|
print(f"Answer quality below threshold (80). Attempting improvement...", flush=True) |
|
|
improvement_prompt = f"""ืืชืฉืืื ืืงืืืืช ืงืืืื ืฆืืื {score}/100. ืืกืืื: {reasoning} |
|
|
|
|
|
ืฉืืืช ืืืฉืชืืฉ: {query} |
|
|
|
|
|
ืืชืฉืืื ืืงืืืืช (ืฉืฆืจืื ืืฉืคืจ): |
|
|
{answer} |
|
|
|
|
|
ืชืืฆืืืช ืืฉืืืืชืืช: |
|
|
{results_text} |
|
|
|
|
|
ืืชืื ืชืฉืืื ืืฉืืคืจืช - ืงืฆืจื, ืืืืืงืช ืืืกืืืจืช. |
|
|
|
|
|
โ ๏ธ ืืืืื: |
|
|
1. ืขื ื ืืฉืืจืืช ืขื ืืฉืืื - ืื ืืืชืจ, ืื ืคืืืช |
|
|
2. ืืฉืชืืฉ ืืืกืคืจืื ืืืืืืงืื ืืืชืืฆืืืช |
|
|
3. ืคืกืงืืืช ืงืฆืจืืช (2-3 ืืฉืคืืื ืื ืืืช) |
|
|
4. ืื ืืฉ ืกืืืื - ืจืฉืื ืื ืงืืืืจืื ืขื ืืืกืคืจ ืฉืื |
|
|
5. ืขืืจืืช ืคืฉืืื ืืืจืืจื |
|
|
|
|
|
ืืื ื: |
|
|
- ืคืกืงื ืจืืฉืื ื: ืชืฉืืื ืืฉืืจื ืืฉืืื ืขื ืืืกืคืจืื ืืขืืงืจืืื |
|
|
- ืคืกืงืืืช ื ืืกืคืืช: ืคืืจืื ืืคื ืืฆืืจื (ืื ืืฉ ืกืืืื - ืื ืงืืืืจืื ืื ืคืจื) |
|
|
- ืืฉืคื ืกืืืื: ืืืกืงื ื ืืขืืงืจืืช |
|
|
|
|
|
โ ๏ธ ืืฉืื: ืชืฉืืื ืงืฆืจื ืืืืืืงืช. ืื ืคืืืืกืืคืื, ืื ืืกืืจืื ืืจืืืื - ืจืง ืื ืชืื ืื ืืืืกืงื ืืช.""" |
|
|
|
|
|
try: |
|
|
response = model.generate_content(improvement_prompt, generation_config=generation_config) |
|
|
improved_text = getattr(response, "text", None) |
|
|
if improved_text and improved_text.strip(): |
|
|
|
|
|
improved_score, improved_reasoning = self._evaluate_answer_quality(query, improved_text.strip(), sql_queries, query_results) |
|
|
print(f"Improved answer quality score: {improved_score:.1f}/100 - {improved_reasoning}", flush=True) |
|
|
if improved_score > score: |
|
|
return improved_text.strip() |
|
|
except Exception as e: |
|
|
print(f"Error improving answer: {e}", flush=True) |
|
|
|
|
|
return answer |
|
|
except Exception as e: |
|
|
print(f"Gemini error in synthesis: {e}", flush=True) |
|
|
|
|
|
|
|
|
if settings.openai_api_key and OpenAI is not None: |
|
|
try: |
|
|
client = OpenAI(api_key=settings.openai_api_key) |
|
|
response = client.chat.completions.create( |
|
|
model="gpt-4o-mini", |
|
|
messages=[{"role": "user", "content": prompt}], |
|
|
temperature=0.8, |
|
|
max_tokens=3000, |
|
|
) |
|
|
text = response.choices[0].message.content |
|
|
if text and text.strip(): |
|
|
answer = text.strip() |
|
|
|
|
|
|
|
|
score, reasoning = self._evaluate_answer_quality(query, answer, sql_queries, query_results) |
|
|
print(f"Answer quality score: {score:.1f}/100 - {reasoning}", flush=True) |
|
|
|
|
|
|
|
|
if score < 80 and max_retries > 0: |
|
|
print(f"Answer quality below threshold (80). Attempting improvement...", flush=True) |
|
|
improvement_prompt = f"""ืืชืฉืืื ืืงืืืืช ืงืืืื ืฆืืื {score}/100. ืืกืืื: {reasoning} |
|
|
|
|
|
ืฉืืืช ืืืฉืชืืฉ: {query} |
|
|
|
|
|
ืืชืฉืืื ืืงืืืืช (ืฉืฆืจืื ืืฉืคืจ): |
|
|
{answer} |
|
|
|
|
|
ืชืืฆืืืช ืืฉืืืืชืืช: |
|
|
{results_text} |
|
|
|
|
|
ืืชืื ืชืฉืืื ืืฉืืคืจืช - ืงืฆืจื, ืืืืืงืช ืืืกืืืจืช. |
|
|
|
|
|
โ ๏ธ ืืืืื: |
|
|
1. ืขื ื ืืฉืืจืืช ืขื ืืฉืืื - ืื ืืืชืจ, ืื ืคืืืช |
|
|
2. ืืฉืชืืฉ ืืืกืคืจืื ืืืืืืงืื ืืืชืืฆืืืช |
|
|
3. ืคืกืงืืืช ืงืฆืจืืช (2-3 ืืฉืคืืื ืื ืืืช) |
|
|
4. ืื ืืฉ ืกืืืื - ืจืฉืื ืื ืงืืืืจืื ืขื ืืืกืคืจ ืฉืื |
|
|
5. ืขืืจืืช ืคืฉืืื ืืืจืืจื |
|
|
|
|
|
ืืื ื: |
|
|
- ืคืกืงื ืจืืฉืื ื: ืชืฉืืื ืืฉืืจื ืืฉืืื ืขื ืืืกืคืจืื ืืขืืงืจืืื |
|
|
- ืคืกืงืืืช ื ืืกืคืืช: ืคืืจืื ืืคื ืืฆืืจื (ืื ืืฉ ืกืืืื - ืื ืงืืืืจืื ืื ืคืจื) |
|
|
- ืืฉืคื ืกืืืื: ืืืกืงื ื ืืขืืงืจืืช |
|
|
|
|
|
โ ๏ธ ืืฉืื: ืชืฉืืื ืงืฆืจื ืืืืืืงืช. ืื ืคืืืืกืืคืื, ืื ืืกืืจืื ืืจืืืื - ืจืง ืื ืชืื ืื ืืืืกืงื ืืช.""" |
|
|
|
|
|
try: |
|
|
response = client.chat.completions.create( |
|
|
model="gpt-4o-mini", |
|
|
messages=[{"role": "user", "content": improvement_prompt}], |
|
|
temperature=0.8, |
|
|
max_tokens=3000, |
|
|
) |
|
|
improved_text = response.choices[0].message.content |
|
|
if improved_text and improved_text.strip(): |
|
|
|
|
|
improved_score, improved_reasoning = self._evaluate_answer_quality(query, improved_text.strip(), sql_queries, query_results) |
|
|
print(f"Improved answer quality score: {improved_score:.1f}/100 - {improved_reasoning}", flush=True) |
|
|
if improved_score > score: |
|
|
return improved_text.strip() |
|
|
except Exception as e: |
|
|
print(f"Error improving answer: {e}", flush=True) |
|
|
|
|
|
return answer |
|
|
except Exception as e: |
|
|
print(f"OpenAI error in synthesis: {e}", flush=True) |
|
|
|
|
|
|
|
|
|
|
|
successful_results = [r for r in query_results if not r.error and len(r.result) > 0] |
|
|
failed_results = [r for r in query_results if r.error] |
|
|
|
|
|
if len(sql_queries) == 0: |
|
|
return "ืื ื ืืฆืจื ืฉืืืืชืืช SQL. ืืืชืื ืฉืืฉืืื ืื ืืจืืจื ืื ืฉืืฉ ืืขืื ืขื ื-API. ื ืกื ืืฉืืื ืฉืืื ืืืจืช ืื ืืืืง ืืช ื-API keys ื-Repository secrets." |
|
|
|
|
|
if successful_results: |
|
|
fallback_text = f"ืกืืืื ืืคืืจื ืฉื ืืืืฆืืื:\n\n" |
|
|
fallback_text += f"ืืืฆืขื {len(sql_queries)} ืฉืืืืชืืช, ืืชืืื {len(successful_results)} ืืฆืืืื ืืืืืืจื ืชืืฆืืืช.\n" |
|
|
if failed_results: |
|
|
fallback_text += f"โ ๏ธ {len(failed_results)} ืฉืืืืชืืช ื ืืฉืื.\n" |
|
|
fallback_text += "\n" |
|
|
|
|
|
|
|
|
for i, qr in enumerate(successful_results, 1): |
|
|
fallback_text += f"ืืืฆืืื ืืฉืืืืชื {i}:\n" |
|
|
fallback_text += f"ืฉืืืืชื: {qr.query}\n" |
|
|
fallback_text += f"ืืกืคืจ ืจืฉืืืืช: {len(qr.result)}\n\n" |
|
|
|
|
|
|
|
|
if len(qr.result) > 0: |
|
|
fallback_text += "ืชืืฆืืืช:\n" |
|
|
|
|
|
numeric_cols = qr.result.select_dtypes(include=['number']).columns |
|
|
if len(numeric_cols) > 0: |
|
|
fallback_text += "ืกืืืืกืืืงืืช:\n" |
|
|
for col in numeric_cols[:3]: |
|
|
fallback_text += f"- {col}: ืืืืฆืข {qr.result[col].mean():.2f}, ืกืืื {qr.result[col].sum():.0f}\n" |
|
|
fallback_text += "\n" |
|
|
|
|
|
|
|
|
fallback_text += "ืืืืืืืช ืืื ืชืื ืื:\n" |
|
|
fallback_text += qr.result.head(5).to_string(index=False) |
|
|
fallback_text += "\n\n" |
|
|
|
|
|
if failed_results: |
|
|
fallback_text += "\nืฉืืืืืช ืืฉืืืืชืืช:\n" |
|
|
for i, qr in enumerate(failed_results, 1): |
|
|
fallback_text += f"ืฉืืืืชื {i}: {qr.error}\n" |
|
|
fallback_text += "\n" |
|
|
|
|
|
fallback_text += "ืืขืจื: ืชืฉืืื ืื ื ืืฆืจื ืืืืืืืืช ืืืชืืฆืืืช. ืื ืืชืื ืืคืืจื ืืืชืจ, ื ืกื ืืฉืืื ืฉืืื ืกืคืฆืืคืืช ืืืชืจ." |
|
|
return fallback_text |
|
|
else: |
|
|
|
|
|
error_details = "" |
|
|
if failed_results: |
|
|
error_details = "\n\nืฉืืืืืช ืืฉืืืืชืืช:\n" |
|
|
for i, qr in enumerate(failed_results, 1): |
|
|
error_details += f"ืฉืืืืชื {i}: {qr.query}\n" |
|
|
error_details += f"ืฉืืืื: {qr.error}\n\n" |
|
|
|
|
|
return f"ืืืฆืขื {len(sql_queries)} ืฉืืืืชืืช, ืื ืื ืืชืงืืื ืชืืฆืืืช ืืื ืชืื ืื.{error_details}\nืืืชืื ืฉืื ืชืื ืื ืื ืืืืืื ืืืืข ืืชืืื ืืฉืืื ืฉื ืฉืืื. ื ืกื ืืฉืืื ืฉืืื ืืืจืช ืื ืืืืืง ืืช ืื ืชืื ืื ืืืืื ืื." |
|
|
|
|
|
def _generate_visualizations(self, query_results: List[SQLQueryResult]) -> Optional[List[Dict[str, Any]]]: |
|
|
""" |
|
|
Generate visualization specifications for query results. |
|
|
|
|
|
This function analyzes the structure of query results and automatically |
|
|
determines the best visualization type (bar, line, scatter, histogram). |
|
|
The specifications are returned as dictionaries that the frontend can |
|
|
use with Chart.js to render the visualizations. |
|
|
|
|
|
Args: |
|
|
query_results: List of SQL query results to visualize |
|
|
|
|
|
Returns: |
|
|
List of visualization specification dictionaries, or None if no |
|
|
visualizations can be generated. Each dict contains: |
|
|
- type: Chart type (bar, line, scatter, histogram) |
|
|
- title: Display title |
|
|
- x, y: Column names for axes |
|
|
- data: The actual data to visualize |
|
|
|
|
|
Visualization selection logic: |
|
|
- 2 columns: bar chart (categorical + numeric) or line chart (time series) |
|
|
- 1 column: histogram (if numeric) |
|
|
- 3+ columns: bar chart (first categorical + first numeric) |
|
|
""" |
|
|
visualizations = [] |
|
|
|
|
|
for i, qr in enumerate(query_results, 1): |
|
|
|
|
|
if qr.error or len(qr.result) == 0: |
|
|
continue |
|
|
|
|
|
|
|
|
result = qr.result |
|
|
|
|
|
|
|
|
if len(result.columns) == 2: |
|
|
col1, col2 = result.columns |
|
|
|
|
|
if result[col2].dtype in ['int64', 'float64']: |
|
|
|
|
|
if 'date' in col1.lower() or 'time' in col1.lower() or 'ืชืืจืื' in col1.lower(): |
|
|
visualizations.append({ |
|
|
"type": "line", |
|
|
"title": f"ืชืืฆืื ืฉื ืฉืืืืชื {i}", |
|
|
"x": col1, |
|
|
"y": col2, |
|
|
"x_label": col1, |
|
|
"y_label": col2, |
|
|
"data": result.to_dict('records') |
|
|
}) |
|
|
else: |
|
|
|
|
|
visualizations.append({ |
|
|
"type": "bar", |
|
|
"title": f"ืชืืฆืื ืฉื ืฉืืืืชื {i}", |
|
|
"x": col1, |
|
|
"y": col2, |
|
|
"x_label": col1, |
|
|
"y_label": col2, |
|
|
"data": result.to_dict('records') |
|
|
}) |
|
|
|
|
|
elif result[col1].dtype in ['int64', 'float64'] and result[col2].dtype in ['int64', 'float64']: |
|
|
visualizations.append({ |
|
|
"type": "scatter", |
|
|
"title": f"ืชืืฆืื ืฉื ืฉืืืืชื {i}", |
|
|
"x": col1, |
|
|
"y": col2, |
|
|
"x_label": col1, |
|
|
"y_label": col2, |
|
|
"data": result.to_dict('records') |
|
|
}) |
|
|
|
|
|
|
|
|
elif len(result.columns) == 1: |
|
|
col = result.columns[0] |
|
|
if result[col].dtype in ['int64', 'float64']: |
|
|
visualizations.append({ |
|
|
"type": "histogram", |
|
|
"title": f"ืชืืฆืื ืฉื ืฉืืืืชื {i}", |
|
|
"x": col, |
|
|
"x_label": col, |
|
|
"data": result[col].tolist() |
|
|
}) |
|
|
|
|
|
|
|
|
elif len(result.columns) >= 3: |
|
|
|
|
|
numeric_cols = [c for c in result.columns if result[c].dtype in ['int64', 'float64']] |
|
|
categorical_cols = [c for c in result.columns if result[c].dtype == 'object'] |
|
|
|
|
|
|
|
|
if len(categorical_cols) >= 1 and len(numeric_cols) >= 1: |
|
|
cat_col = categorical_cols[0] |
|
|
num_col = numeric_cols[0] |
|
|
visualizations.append({ |
|
|
"type": "bar", |
|
|
"title": f"ืชืืฆืื ืฉื ืฉืืืืชื {i}", |
|
|
"x": cat_col, |
|
|
"y": num_col, |
|
|
"x_label": cat_col, |
|
|
"y_label": num_col, |
|
|
"data": result.to_dict('records') |
|
|
}) |
|
|
|
|
|
return visualizations if visualizations else None |
|
|
|
|
|
|