Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,17 +4,151 @@ import re
|
|
| 4 |
import numpy as np
|
| 5 |
from typing import List, Dict, Any
|
| 6 |
|
|
|
|
| 7 |
# Load and clean the dataset
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# Clean brand and model columns
|
| 11 |
df["brand"] = df["brand"].str.strip().str.lower()
|
| 12 |
df["model"] = df["model"].str.strip()
|
| 13 |
-
df["features"] = df["features"].astype(str).str.lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
# Control long responses
|
| 16 |
MAX_TOTAL_CHARACTERS = 5000
|
| 17 |
|
|
|
|
|
|
|
| 18 |
def extract_numbers(text: str) -> List[float]:
    """Return every unsigned number found in *text*, in order, as floats."""
    # Integer part, optional dot, optional fraction - the sign is never captured.
    return [float(hit.group(0)) for hit in re.finditer(r'\d+\.?\d*', text)]
|
|
@@ -22,271 +156,429 @@ def extract_numbers(text: str) -> List[float]:
|
|
| 22 |
def find_brand_mentions(query: str) -> List[str]:
|
| 23 |
"""Find all brand mentions in query"""
|
| 24 |
unique_brands = df["brand"].unique()
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
def find_model_mentions(query: str) -> List[str]:
|
| 28 |
"""Find all model mentions in query"""
|
| 29 |
unique_models = df["model"].str.lower().unique()
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
def extract_price_range(query: str) -> tuple:
|
| 33 |
"""Extract price range from query"""
|
| 34 |
min_price, max_price = None, None
|
|
|
|
| 35 |
|
| 36 |
-
# Pattern for "under X", "below X", "less than X"
|
| 37 |
-
under_match = re.search(r'(?:under|below|less than|up to)\s*₹?(\d
|
| 38 |
if under_match:
|
| 39 |
max_price = float(under_match.group(1))
|
| 40 |
-
|
| 41 |
-
# Pattern for "above X", "more than X", "at least X"
|
| 42 |
-
above_match = re.search(r'(?:above|more than|at least|over)\s*₹?(\d
|
| 43 |
if above_match:
|
| 44 |
min_price = float(above_match.group(1))
|
| 45 |
-
|
| 46 |
# Pattern for "between X and Y"
|
| 47 |
-
between_match = re.search(r'between\s*₹?(\d
|
| 48 |
if between_match:
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
| 52 |
# Pattern for "around X", "approximately X"
|
| 53 |
-
around_match = re.search(r'(?:around|approximately|about)\s*₹?(\d
|
| 54 |
if around_match:
|
| 55 |
target = float(around_match.group(1))
|
| 56 |
-
min_price = target
|
| 57 |
-
max_price = target
|
| 58 |
-
|
| 59 |
return min_price, max_price
|
| 60 |
|
| 61 |
def extract_mileage_range(query: str) -> tuple:
|
| 62 |
"""Extract mileage requirements from query"""
|
| 63 |
min_mileage, max_mileage = None, None
|
|
|
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
has_mileage_context = any(keyword in query.lower() for keyword in mileage_keywords)
|
| 68 |
|
| 69 |
if has_mileage_context:
|
| 70 |
# Pattern for "above X kmpl", "more than X kmpl"
|
| 71 |
-
above_match = re.search(r'(?:above|more than|at least|over)\s*(\d
|
| 72 |
if above_match:
|
| 73 |
min_mileage = float(above_match.group(1))
|
| 74 |
-
|
| 75 |
# Pattern for "below X kmpl", "under X kmpl"
|
| 76 |
-
below_match = re.search(r'(?:below|under|less than)\s*(\d
|
| 77 |
if below_match:
|
| 78 |
max_mileage = float(below_match.group(1))
|
| 79 |
-
|
| 80 |
return min_mileage, max_mileage
|
| 81 |
|
| 82 |
-
def
|
| 83 |
-
"""Extract feature requirements from query"""
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
"
|
| 89 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
]
|
| 91 |
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
def get_comparison_cars(query: str) -> List[Dict]:
|
| 95 |
-
"""Handle comparison queries"""
|
| 96 |
-
|
| 97 |
-
comparison_words = ['vs', 'versus', 'compare', 'comparison', 'better', 'best']
|
| 98 |
-
if not any(word in query.lower() for word in comparison_words):
|
| 99 |
-
return []
|
| 100 |
-
|
| 101 |
-
brands = find_brand_mentions(query)
|
| 102 |
-
models = find_model_mentions(query)
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
return []
|
| 112 |
|
| 113 |
def handle_specific_questions(query: str) -> str:
|
| 114 |
-
"""Handle specific question types"""
|
| 115 |
query_lower = query.lower()
|
| 116 |
|
| 117 |
-
# Price questions
|
| 118 |
if any(word in query_lower for word in ['cheapest', 'lowest price', 'most affordable']):
|
| 119 |
-
cheapest = df.loc[df['
|
| 120 |
-
return f"💰
|
| 121 |
|
| 122 |
if any(word in query_lower for word in ['most expensive', 'highest price', 'premium']):
|
| 123 |
-
expensive = df.loc[df['
|
| 124 |
-
return f"💎
|
| 125 |
|
| 126 |
-
# Mileage questions
|
| 127 |
if any(word in query_lower for word in ['best mileage', 'highest mileage', 'most fuel efficient']):
|
| 128 |
-
best_mileage = df.loc[df['
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
if any(word in query_lower for word in ['worst mileage', 'lowest mileage', 'least fuel efficient']):
|
| 132 |
-
worst_mileage = df.loc[df['
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
# Count questions
|
| 136 |
if any(word in query_lower for word in ['how many', 'count', 'number of']):
|
| 137 |
-
|
| 138 |
-
|
|
|
|
| 139 |
count = len(df[df['brand'] == brand])
|
| 140 |
-
return f"📊 {brand.title()} has {count}
|
| 141 |
-
|
| 142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
# Average questions
|
| 145 |
if 'average' in query_lower:
|
| 146 |
if 'price' in query_lower:
|
| 147 |
-
avg_price = df['
|
| 148 |
-
return f"📊
|
| 149 |
-
elif 'mileage' in query_lower:
|
| 150 |
-
avg_mileage = df['
|
| 151 |
-
return f"📊
|
| 152 |
|
| 153 |
-
# Brand-specific
|
| 154 |
brands = find_brand_mentions(query)
|
| 155 |
-
if brands and any(word in query_lower for word in ['models', 'variants', 'options']):
|
| 156 |
brand = brands[0]
|
| 157 |
brand_cars = df[df['brand'] == brand]
|
| 158 |
models = brand_cars['model'].unique()
|
| 159 |
-
return f"🚗 {brand.title()} models: {', '.join(models)}"
|
| 160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
return ""
|
| 162 |
|
| 163 |
def format_car_details(car: Dict, show_features: bool = True, compact: bool = False) -> str:
|
| 164 |
-
"""Format car details for display"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
if compact:
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
| 169 |
features_text = ""
|
| 170 |
-
if show_features and 'features' in car:
|
| 171 |
-
features = car['features']
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
-
|
| 177 |
-
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
def answer_question(query: str) -> str:
|
| 181 |
if not query.strip():
|
| 182 |
return "❓ Please ask me something about Indian cars!"
|
| 183 |
|
| 184 |
query = query.strip()
|
| 185 |
-
|
| 186 |
-
|
|
|
|
| 187 |
specific_answer = handle_specific_questions(query)
|
| 188 |
if specific_answer:
|
| 189 |
return specific_answer
|
| 190 |
|
| 191 |
-
# Handle comparisons
|
| 192 |
comparison_cars = get_comparison_cars(query)
|
| 193 |
if comparison_cars:
|
| 194 |
-
response = "📊
|
| 195 |
-
for car in comparison_cars
|
| 196 |
-
response += format_car_details(car, show_features=
|
|
|
|
|
|
|
|
|
|
| 197 |
return response.strip()
|
| 198 |
|
| 199 |
-
# Check for specific car mention (
|
|
|
|
| 200 |
for _, row in df.iterrows():
|
| 201 |
car_name = f"{row['brand']} {row['model']}".lower()
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
-
#
|
| 206 |
filtered_df = df.copy()
|
| 207 |
|
| 208 |
# Filter by brand
|
| 209 |
brands = find_brand_mentions(query)
|
| 210 |
if brands:
|
| 211 |
filtered_df = filtered_df[filtered_df["brand"].isin(brands)]
|
| 212 |
-
|
| 213 |
-
# Filter by model
|
| 214 |
models = find_model_mentions(query)
|
| 215 |
if models:
|
| 216 |
filtered_df = filtered_df[filtered_df["model"].str.lower().isin(models)]
|
| 217 |
-
|
| 218 |
# Filter by price
|
| 219 |
min_price, max_price = extract_price_range(query)
|
| 220 |
if min_price is not None:
|
| 221 |
-
filtered_df = filtered_df[filtered_df["
|
| 222 |
if max_price is not None:
|
| 223 |
-
filtered_df = filtered_df[filtered_df["
|
| 224 |
-
|
| 225 |
-
# Filter by mileage
|
| 226 |
min_mileage, max_mileage = extract_mileage_range(query)
|
| 227 |
if min_mileage is not None:
|
| 228 |
-
filtered_df = filtered_df[filtered_df["
|
| 229 |
if max_mileage is not None:
|
| 230 |
-
filtered_df = filtered_df[filtered_df["
|
| 231 |
-
|
| 232 |
-
# Filter by
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
if
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
filtered_df = filtered_df.
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
if filtered_df.empty:
|
| 247 |
-
return "❌ No matching cars found for your query.
|
| 248 |
-
|
| 249 |
response = ""
|
| 250 |
|
| 251 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
is_simple_brand_query = (
|
| 253 |
-
len(brands) == 1 and
|
| 254 |
-
not models and
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
not
|
| 258 |
-
not any(
|
|
|
|
| 259 |
)
|
| 260 |
-
|
| 261 |
-
if is_simple_brand_query
|
| 262 |
-
|
| 263 |
-
response += f"🏷️ All {brands[0].title()} cars in our database ({len(filtered_df)} models):\n\n"
|
| 264 |
for _, row in filtered_df.iterrows():
|
| 265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
|
| 267 |
-
# Add a summary
|
| 268 |
-
avg_price = filtered_df['
|
| 269 |
-
avg_mileage = filtered_df['
|
| 270 |
-
price_range = f"₹{filtered_df['
|
| 271 |
-
response += f"\n📊 Summary:
|
| 272 |
|
| 273 |
else:
|
| 274 |
-
#
|
| 275 |
-
if
|
| 276 |
-
|
| 277 |
|
| 278 |
-
#
|
| 279 |
-
max_detailed_cars = 8 if
|
| 280 |
|
| 281 |
for _, row in filtered_df.head(max_detailed_cars).iterrows():
|
| 282 |
-
entry = format_car_details(row.to_dict()) + "\n"
|
| 283 |
if len(response + entry) > MAX_TOTAL_CHARACTERS:
|
|
|
|
| 284 |
break
|
| 285 |
response += entry
|
| 286 |
-
|
| 287 |
-
if len(
|
| 288 |
-
|
| 289 |
-
|
| 290 |
return response.strip()
|
| 291 |
|
| 292 |
# Enhanced Gradio interface
|
|
@@ -298,9 +590,12 @@ examples = [
|
|
| 298 |
"Best mileage car under 10 lakhs",
|
| 299 |
"Mahindra cars with price and mileage",
|
| 300 |
"Cars between 5 and 15 lakhs",
|
| 301 |
-
"Which car has the best features?",
|
| 302 |
"Show me all Honda models",
|
| 303 |
-
"Average price of cars in database"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
]
|
| 305 |
|
| 306 |
gr.Interface(
|
|
@@ -315,7 +610,7 @@ gr.Interface(
|
|
| 315 |
label="Car Information"
|
| 316 |
),
|
| 317 |
title="🚘 Enhanced Indian Car AI Assistant",
|
| 318 |
-
description="Ask me anything about Indian cars! I can help with comparisons, recommendations, specifications, and more.",
|
| 319 |
examples=examples,
|
| 320 |
theme="soft"
|
| 321 |
).launch()
|
|
|
|
| 4 |
import numpy as np
|
| 5 |
from typing import List, Dict, Any
|
| 6 |
|
| 7 |
+
# --- Data Loading and Initial Cleaning ---
# Load the dataset. This assumes indian_car_info.csv is a properly formatted
# CSV (quoted "features" cells are handled by pd.read_csv); free-form text
# lines would need a custom parser instead.
try:
    df = pd.read_csv("indian_car_info.csv")
except FileNotFoundError:
    # SystemExit with a message prints it to stderr and exits with status 1.
    # The bare exit() helper comes from the site module, is not guaranteed to
    # be available, and reported success (status 0) for a fatal condition.
    raise SystemExit(
        "Error: indian_car_info.csv not found. Please ensure the file is in the same directory."
    ) from None

# Clean brand and model columns
df["brand"] = df["brand"].str.strip().str.lower()
df["model"] = df["model"].str.strip()
# Ensure features are strings (missing cells become the text "nan") and lowercased
df["features"] = df["features"].astype(str).str.lower()
|
| 23 |
+
|
| 24 |
+
# --- Advanced Data Preprocessing: Parsing Ranges and Inferring Categories ---
|
| 25 |
+
|
| 26 |
+
def parse_price_mileage_range(value_str: Any) -> tuple:
    """Parse a price or mileage cell into a ``(min_val, max_val)`` pair.

    Handles '24.8-25.75' (range), '6.49' (single value), '452 km range'
    (EV range) and 'Expected ~18-20' (estimates). Cells that carry a unit
    suffix such as '6.49 lakhs' or '18.5 kmpl' are parsed as well instead
    of being dropped.

    Args:
        value_str: Raw cell value; may be a string, a number or missing.

    Returns:
        ``(min_val, max_val)`` as floats, or ``(np.nan, np.nan)`` when the
        cell is missing or contains no usable number.
    """
    nan_pair = (np.nan, np.nan)
    if pd.isna(value_str):
        return nan_pair

    text = str(value_str).lower().replace('₹', '').replace('lakh', '').strip()
    numbers = [float(tok) for tok in re.findall(r'\d+(?:\.\d+)?', text)]

    # EV ranges (e.g. "452 km range"): the first number is the range.
    if 'km range' in text:
        return (numbers[0], numbers[0]) if numbers else nan_pair

    # Estimates such as "Expected ~18-20" or "Expected ~15": one or two numbers only.
    if 'expected' in text:
        if len(numbers) == 2:
            return numbers[0], numbers[1]
        if len(numbers) == 1:
            return numbers[0], numbers[0]
        return nan_pair

    # Strict parse first so clean cells ("24.8-25.75", "6.49") behave exactly
    # as before; split('-') on a cell without a dash yields the cell itself.
    try:
        parts = [float(piece.strip()) for piece in text.split('-')]
    except ValueError:
        # Leftover text (plural "lakhs" -> "s", "kmpl", ...) broke float();
        # fall back to the numbers that are actually present in the cell.
        parts = numbers

    if not parts:
        return nan_pair
    return min(parts), max(parts)
|
| 72 |
+
|
| 73 |
+
# Derive <column>_min / <column>_max from each raw range column
# (mileage first, then price, so the new columns keep their original order).
for _source_col in ('mileage_kmpl', 'price_lakh'):
    _bounds = df[_source_col].apply(lambda raw: pd.Series(parse_price_mileage_range(raw)))
    df[[f'{_source_col}_min', f'{_source_col}_max']] = _bounds
|
| 76 |
+
|
| 77 |
+
# Infer new columns from 'engine' and 'features' for better filtering
|
| 78 |
+
def infer_car_attributes(row: pd.Series) -> pd.Series:
    """Infer fuel type, transmission, seating and body type for one car row.

    Args:
        row: A dataset row providing 'engine', 'features' and 'model'.
            Missing values are tolerated (they are read as the text "nan").

    Returns:
        A Series with keys 'fuel_type', 'transmission', 'seating_capacity'
        (int, or NaN when no seat count is mentioned) and 'body_type'.
    """
    engine = str(row['engine']).lower()
    features = str(row['features']).lower()
    model = str(row['model']).lower()

    # Fuel type: first matching keyword wins, petrol is the default.
    fuel_type = 'petrol'
    if 'diesel' in engine or 'diesel' in features:
        fuel_type = 'diesel'
    elif 'cng' in engine or 'cng' in features:
        fuel_type = 'cng'
    elif 'electric' in engine or 'electric' in features or 'motor' in engine:
        # 'motor' covers e.g. "Permanent Magnet Synchronous Motor"
        fuel_type = 'electric'
    elif 'hybrid' in engine or 'hybrid' in features:
        fuel_type = 'hybrid'  # strong vs mild hybrid is not distinguished

    # Transmission: manual unless an automatic gearbox token is present.
    # Tokens must stand alone: a plain substring test for "at" fires on
    # "heated", "seats", "climate", ... and flags almost every car. The word
    # "automatic" is ignored when it describes climate control or headlamps.
    automatic_token = re.search(
        r'(?<![a-z])(?:automatic(?!\s+(?:climate|headlamps?))|at|amt|dct|cvt)(?![a-z])',
        features,
    )
    if automatic_token or 'paddle shifters' in features:  # paddles imply an automatic
        transmission = 'automatic'
    else:
        transmission = 'manual'

    # Seating: "7-seater", "5 seater" and "6/7 seater" (read as 7) all match.
    seat_match = re.search(r'(\d+)[-\s]?seater', features)
    seating_capacity = int(seat_match.group(1)) if seat_match else np.nan

    # Body type: keyword lookup, first hit wins.
    body_type = 'other'
    if 'suv' in features or 'suv' in model:
        body_type = 'suv'
    elif 'sedan' in features or 'sedan' in model:
        body_type = 'sedan'
    elif 'hatchback' in features or 'hatchback' in model:
        body_type = 'hatchback'
    elif 'muv' in features:
        body_type = 'muv'
    elif 'pickup truck' in features:
        body_type = 'pickup'

    return pd.Series({
        'fuel_type': fuel_type,
        'transmission': transmission,
        'seating_capacity': seating_capacity,
        'body_type': body_type,
    })
|
| 137 |
+
|
| 138 |
+
# Add the inferred attribute columns (fuel_type, transmission, ...) to the frame
inferred_attributes = df.apply(infer_car_attributes, axis=1)
df = df.assign(**inferred_attributes.to_dict('list'))

# Force the numeric columns to numeric dtypes; unparseable entries become NaN
numeric_cols = ['mileage_kmpl_min', 'mileage_kmpl_max', 'price_lakh_min', 'price_lakh_max', 'seating_capacity']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Upper bound on the length of a generated answer
MAX_TOTAL_CHARACTERS = 5000
|
| 149 |
|
| 150 |
+
# --- Helper Functions for Query Parsing ---
|
| 151 |
+
|
| 152 |
def extract_numbers(text: str) -> List[float]:
    """Collect all unsigned numbers in *text* as floats, in order of appearance."""
    number_pattern = re.compile(r'\d+\.?\d*')
    return list(map(float, number_pattern.findall(text)))
|
|
|
|
| 156 |
def find_brand_mentions(query: str) -> List[str]:
    """Return the dataset brands that occur in *query* as whole words.

    Matching is case-insensitive on the query side and follows the order of
    ``df["brand"].unique()``.
    """
    lowered = query.lower()
    return [
        candidate
        for candidate in df["brand"].unique()
        # \b on both sides rejects partial hits inside longer words
        if re.search(r'\b' + re.escape(candidate) + r'\b', lowered)
    ]
|
| 165 |
|
| 166 |
def find_model_mentions(query: str) -> List[str]:
    """Return the lowercased dataset model names that occur in *query* as whole words."""
    lowered = query.lower()
    known_models = df["model"].str.lower().unique()
    return [
        candidate
        for candidate in known_models
        if re.search(r'\b' + re.escape(candidate) + r'\b', lowered)
    ]
|
| 174 |
|
| 175 |
def extract_price_range(query: str) -> tuple:
    """Extract a price window from a free-text query.

    Recognises "under/below/less than/up to X", "above/more than/at least/
    over X", "between X and Y" and "around/approximately/about X" (read as
    X +/- 20%). When several patterns match, later ones override earlier ones
    in that order.

    Args:
        query: The user's question; matching is case-insensitive.

    Returns:
        ``(min_price, max_price)``; each is a float or ``None`` when the
        query does not constrain that side.
    """
    min_price, max_price = None, None
    query = query.lower()

    # One price figure: optional rupee sign, then a number. The lookahead
    #  * refuses a number that is directly followed by more digits, so the
    #    regex cannot backtrack onto the "2" of "20", and
    #  * refuses a number followed by a distance unit ("20 kmpl", "400 km"),
    #    which is a mileage/range figure rather than a price.
    amount = r'\s*₹?\s*(\d+(?:\.\d+)?)(?!\.?\d|\s*km)'

    # \b keeps the keywords from matching inside other words ("land rover 5 seater").
    under_match = re.search(r'\b(?:under|below|less than|up to)' + amount, query)
    if under_match:
        max_price = float(under_match.group(1))

    above_match = re.search(r'\b(?:above|more than|at least|over)' + amount, query)
    if above_match:
        min_price = float(above_match.group(1))

    # "between X and Y" / "between X to Y"; a unit after X ("5 lakhs and 15 lakhs") is allowed
    between_match = re.search(
        r'\bbetween' + amount + r'(?:\s*lakhs?)?\s*(?:and|to)' + amount, query
    )
    if between_match:
        first = float(between_match.group(1))
        second = float(between_match.group(2))
        min_price, max_price = min(first, second), max(first, second)

    around_match = re.search(r'\b(?:around|approximately|about)' + amount, query)
    if around_match:
        target = float(around_match.group(1))
        min_price = target * 0.8  # +/- 20% tolerance for "around"
        max_price = target * 1.2

    return min_price, max_price
|
| 206 |
|
| 207 |
def extract_mileage_range(query: str) -> tuple:
    """Extract mileage (or EV range) bounds from a free-text query.

    Bounds are only looked for when the query mentions mileage at all
    ('mileage', 'fuel efficiency', 'kmpl', 'fuel economy' or 'range').

    Args:
        query: The user's question; matching is case-insensitive.

    Returns:
        ``(min_mileage, max_mileage)``; each is a float or ``None`` when
        the query does not constrain that side.
    """
    min_mileage, max_mileage = None, None
    query = query.lower()

    mileage_keywords = ['mileage', 'fuel efficiency', 'kmpl', 'fuel economy', 'range']
    if not any(keyword in query for keyword in mileage_keywords):
        return min_mileage, max_mileage

    # One mileage figure. The lookahead
    #  * refuses a number directly followed by more digits, so the regex
    #    cannot backtrack onto the "1" of "10", and
    #  * refuses a number followed by a money unit: in "good mileage cars
    #    under 10 lakhs" the 10 is a price cap, not a mileage cap.
    figure = r'\s*(\d+(?:\.\d+)?)(?!\.?\d|\s*(?:lakhs?|lacs?|crores?)\b)'

    # "above X kmpl", "more than X kmpl", ... (\b avoids hits such as "rover")
    above_match = re.search(r'\b(?:above|more than|at least|over)' + figure, query)
    if above_match:
        min_mileage = float(above_match.group(1))

    # "below X kmpl", "under X kmpl", ...
    below_match = re.search(r'\b(?:below|under|less than)' + figure, query)
    if below_match:
        max_mileage = float(below_match.group(1))

    return min_mileage, max_mileage
|
| 227 |
|
| 228 |
+
def extract_features_from_query(query: str) -> List[str]:
    """Return the known feature keywords that are mentioned in *query*.

    Keywords are matched case-insensitively as whole words or phrases (a
    trailing plural "s" is accepted), so short entries such as "abs" or
    "hud" no longer fire inside unrelated words like "absolutely".

    Args:
        query: The user's question.

    Returns:
        Matched keywords, lowercase, in the order of the keyword table.
        Overlapping entries ("sunroof" and "panoramic sunroof") are both
        reported.
    """
    query = query.lower()
    # Keywords to look for in the 'features' column. All entries are
    # lowercase because the query is lowercased before matching; keep them
    # distinct from attributes that have their own derived columns.
    general_feature_keywords = (
        "sunroof", "panoramic sunroof", "360-degree camera", "head-up display",
        "hud", "wireless charging", "ambient lighting", "cruise control",
        "rear ac vents", "push start/stop", "electrically adjustable orvms",
        "automatic climate control", "digital speedometer", "dual airbags",
        "abs", "ebd", "reverse parking sensors", "connected car tech", "bluelink",
        "adas", "hyundai smartsense", "honda sensing", "ventilated front seats",
        "dual zone climate control", "electronic parking brake", "auto hold",
        "apple carplay", "android auto", "bose premium sound", "jbl sound system",
        "powered tailgate", "terrain response modes", "digital instrument cluster",
        "air purifier", "traction pro mode", "dca automatic", "voice assistant",
        "paddle shifters", "allgrip awd", "4x4", "ladder frame chassis", "hill hold assist",
        "hill descent control", "ventilated front cup holders", "smart hybrid technology",
        "uv-cut glass", "boosterjet engine", "i-smart", "digital bluetooth key",
        "heated orvms", "personal ai assistant", "ultrafast charging", "v2l",
        "sliding center console", "panoramic vision roof", "meridian premium sound",
        "augmented reality hud", "multi-terrain modes", "multi-mode regen", "frunk",
        "voice-enabled sunroof", "dashcam", "footwell lighting", "heated seats",
        "powered driver seat", "triple-zone climate control", "differential lock",
        "active traction control", "dac", "cooler box", "lane watch camera",
        "multi-sense driving modes", "easyfix seats", "detachable 3rd row",
        "digital cockpit", "4motion all-wheel drive", "park assist", "citroen advanced comfort",
        "customization options", "high ground clearance", "comfortable ride",
        "roof mounted rear ac vents", "true 7-seater", "washable interior",
        "tyre pressure monitoring system", "mld technology", "micro hybrid technology",
        "static bending headlamps", "robust build quality", "harmon infotainment",
        "corner stability control", "rain sensing wipers", "automatic headlamps",
        "multi-zone climate control",
    )

    def is_mentioned(keyword: str) -> bool:
        # No letter/digit may touch the keyword on either side, apart from
        # an optional plural "s" ("sunroofs").
        pattern = r'(?<![a-z0-9])' + re.escape(keyword) + r's?(?![a-z0-9])'
        return re.search(pattern, query) is not None

    return [keyword for keyword in general_feature_keywords if is_mentioned(keyword)]
|
| 268 |
|
| 269 |
def get_comparison_cars(query: str) -> List[Dict]:
    """Return the cars to compare when *query* is a comparison question.

    A query counts as a comparison when it contains a comparison phrase and
    names at least two distinct cars by their full "brand model" name.

    Args:
        query: The user's question; matching is case-insensitive.

    Returns:
        One record dict per dataset row whose full name appears in the
        query (dataset order), or ``[]`` when the query is not a comparison
        or names fewer than two cars.
    """
    query_lower = query.lower()

    # "vs" must stand alone, otherwise every query mentioning "suvs" would
    # qualify; the remaining triggers stay plain substring checks so that
    # inflected forms ("compared", "comparisons") still count.
    phrase_triggers = ('versus', 'compare', 'comparison', 'better', 'which is', 'difference between')
    is_comparison = (
        re.search(r'\bvs\b', query_lower) is not None
        or any(trigger in query_lower for trigger in phrase_triggers)
    )
    if not is_comparison:
        return []

    # Full car names (brand + model), lowercased, aligned with df's index.
    full_names = (df['brand'] + ' ' + df['model']).str.lower()
    mentioned = full_names.map(
        lambda name: isinstance(name, str) and name in query_lower
    ).astype(bool)

    # Select on the full name rather than the model alone, so that a model
    # name shared by another brand does not pull unrelated cars in.
    if full_names[mentioned].nunique() >= 2:
        return df[mentioned].to_dict('records')

    return []
|
| 296 |
|
| 297 |
def handle_specific_questions(query: str) -> str:
    """Answer superlative, count, average and brand-listing questions directly.

    Checks run in a fixed order and the first match wins: cheapest, most
    expensive, best mileage, worst mileage, counts, averages, models of a
    brand, then subjective questions.

    Args:
        query: The user's question; matching is case-insensitive.

    Returns:
        A ready-to-display answer, or ``""`` when the query is none of the
        handled question types.
    """
    query_lower = query.lower()

    def mentions(*phrases: str) -> bool:
        """True when any of the phrases occurs in the lowercased query."""
        return any(phrase in query_lower for phrase in phrases)

    def mileage_label(car) -> str:
        """Describe the row's minimum mileage, as a range for electric/hybrid cars."""
        value = car['mileage_kmpl_min']
        if 'electric' in car['fuel_type'] or 'hybrid' in car['fuel_type']:
            return f"{value} km range (Electric/Hybrid)"
        return f"{value} kmpl"

    # Price questions (using _min columns)
    if mentions('cheapest', 'lowest price', 'most affordable'):
        cheapest = df.loc[df['price_lakh_min'].idxmin()]
        return f"💰 The cheapest car in our database is the {cheapest['brand'].title()} {cheapest['model']} at ₹{cheapest['price_lakh_min']:.2f} Lakh."

    if mentions('most expensive', 'highest price', 'premium'):
        expensive = df.loc[df['price_lakh_min'].idxmax()]
        return f"💎 The most expensive car in our database is the {expensive['brand'].title()} {expensive['model']} at ₹{expensive['price_lakh_min']:.2f} Lakh."

    # Mileage questions (using _min columns)
    if mentions('best mileage', 'highest mileage', 'most fuel efficient'):
        best_mileage = df.loc[df['mileage_kmpl_min'].idxmax()]
        mileage_text = mileage_label(best_mileage)
        return f"⛽ The car with the best mileage/range is the {best_mileage['brand'].title()} {best_mileage['model']} with {mileage_text}."

    if mentions('worst mileage', 'lowest mileage', 'least fuel efficient'):
        worst_mileage = df.loc[df['mileage_kmpl_min'].idxmin()]
        mileage_text = mileage_label(worst_mileage)
        return f"⛽ The car with the lowest mileage/range is the {worst_mileage['brand'].title()} {worst_mileage['model']} with {mileage_text}."

    # Count questions: by brand, fuel type, transmission, seats, else the total
    if mentions('how many', 'count', 'number of'):
        brands = find_brand_mentions(query)
        if brands:
            brand = brands[0]
            count = len(df[df['brand'] == brand])
            return f"📊 {brand.title()} has {count} car models in our database."

        for ft in ('petrol', 'diesel', 'electric', 'cng', 'hybrid'):
            if ft in query_lower and 'car' in query_lower:
                count = len(df[df['fuel_type'] == ft])
                return f"📊 There are {count} {ft.title()} car models in our database."

        for tr in ('automatic', 'manual'):
            if tr in query_lower and 'car' in query_lower:
                count = len(df[df['transmission'] == tr])
                return f"📊 There are {count} {tr.title()} transmission car models in our database."

        seating_match = re.search(r'(\d+)\s*seat', query_lower)
        if seating_match:
            seats = int(seating_match.group(1))
            count = len(df[df['seating_capacity'] == seats])
            # Leading emoji restored: the literal had been reduced to
            # replacement characters, unlike every other count answer.
            return f"📊 There are {count} car models with {seats} seats in our database."

        return f"📊 Total cars in database: {len(df)}."

    # Average questions
    if 'average' in query_lower:
        if 'price' in query_lower:
            avg_price = df['price_lakh_min'].mean()
            return f"📊 The average minimum car price in our database is ₹{avg_price:.2f} Lakh."
        elif 'mileage' in query_lower or 'fuel efficiency' in query_lower:
            avg_mileage = df['mileage_kmpl_min'].mean()
            return f"📊 The average minimum mileage/range in our database is {avg_mileage:.2f} kmpl/km."

    # Brand-specific models/variants (only the first mentioned brand is listed)
    brands = find_brand_mentions(query)
    if brands and mentions('models', 'variants', 'options', 'cars'):
        brand = brands[0]
        brand_cars = df[df['brand'] == brand]
        models = brand_cars['model'].unique()
        return f"🚗 {brand.title()} has the following models in our database: {', '.join(models)}."

    # Subjective questions that the data cannot answer directly
    if mentions('best features', 'most luxurious', 'most reliable', 'safest'):
        return "🤔 That's a great question, but 'best' or 'most luxurious' can be subjective! I can tell you about specific features if you ask, or list cars based on quantifiable criteria like price, mileage, or presence of ADAS/sunroof."

    return ""
|
| 376 |
|
| 377 |
def format_car_details(car: Dict, show_features: bool = True, compact: bool = False) -> str:
    """Format one car record as display text.

    Missing values (absent keys, ``None``, NaN/NA) are rendered as ``N/A``
    (or as an empty string for brand and model) instead of raising an
    ``AttributeError`` or printing ``nan``.

    Args:
        car: Mapping of column name to value, e.g. ``row.to_dict()``.
        show_features: Append a "Key Features" line (capped at 200
            characters) to the detailed layout. Ignored when ``compact``
            is true.
        compact: Return a one-line summary instead of the multi-line card.

    Returns:
        The formatted text. The detailed layout ends with a newline; the
        compact layout does not.
    """

    def is_missing(value) -> bool:
        # Only scalars can be "missing"; pd.isna() on a list returns an array.
        return value is None or bool(pd.api.types.is_scalar(value) and pd.isna(value))

    def text(key: str, default: str = 'N/A') -> str:
        value = car.get(key)
        return default if is_missing(value) else str(value)

    brand = text('brand', '').title()
    model = text('model', '')

    # NOTE(review): hybrids are labelled "km range" exactly like EVs; confirm
    # that the dataset stores a range (not kmpl) for hybrid rows.
    fuel_lower = text('fuel_type', '').lower()
    unit = "km range" if ('electric' in fuel_lower or 'hybrid' in fuel_lower) else "kmpl"
    mileage_text = f"{text('mileage_kmpl_min')} {unit}"

    if compact:
        return (
            f"🚗 {brand} {model} | "
            f"₹{text('price_lakh_min')}L | {mileage_text} | "
            f"{text('engine')}"
        )

    lines = [
        f"🚗 {brand} {model}",
        f"- Engine: {text('engine')}",
        f"- Fuel Type: {text('fuel_type').title()}",
        f"- Transmission: {text('transmission').title()}",
        f"- Mileage/Range: {mileage_text}",
        f"- Price: ₹{text('price_lakh_min')}-{text('price_lakh_max')} Lakh",
    ]

    if not is_missing(car.get('seating_capacity')):
        lines.append(f"- Seating: {int(car['seating_capacity'])}-seater")
    if not is_missing(car.get('body_type')):
        lines.append(f"- Body Type: {str(car['body_type']).title()}")

    if show_features and 'features' in car and not is_missing(car['features']):
        features = str(car['features'])
        # Cap the feature list at 200 characters and mark the cut with "...".
        display_features = features[:200]
        if len(features) > 200:
            display_features += "..."
        lines.append(f"- Key Features: {display_features.title()}")

    return "\n".join(lines) + "\n"
|
| 413 |
+
|
| 414 |
+
# --- Main Answer Function ---
|
| 415 |
|
| 416 |
def answer_question(query: str) -> str:
    """Answer a free-text question about the cars in ``df``.

    Strategies are tried from most to least specific:

    1. canned questions handled by ``handle_specific_questions``;
    2. explicit comparisons found by ``get_comparison_cars``;
    3. a details request naming one brand + model;
    4. generic filtering by brand, model, price, mileage, fuel type,
       transmission, seating capacity, body type and feature keywords.

    Args:
        query: The user's question as typed.

    Returns:
        A display-ready response string. Long listings are cut off around
        ``MAX_TOTAL_CHARACTERS``.
    """
    if not query or not query.strip():
        return "❓ Please ask me something about Indian cars!"

    query = query.strip()
    query_lower = query.lower()  # Used for all case-insensitive checks

    # 1. Specific questions (e.g. "cheapest car", "how many Maruti cars")
    specific_answer = handle_specific_questions(query)
    if specific_answer:
        return specific_answer

    # 2. Direct car comparisons (e.g. "Creta vs Seltos")
    comparison_cars = get_comparison_cars(query)
    if comparison_cars:
        response = "📊 Here's a comparison of the cars you asked about:\n\n"
        for car in comparison_cars:
            response += format_car_details(car, show_features=True) + "\n"
            if len(response) > MAX_TOTAL_CHARACTERS * 0.8:  # Limit comparison length
                response += "\n... (some details truncated for brevity)\n"
                break
        return response.strip()

    # 3. Details for a single named car (e.g. "details of Tata Nexon").
    # Runs after the comparison step so "compare Nexon vs Harrier" is not
    # swallowed here. The keyword test does not depend on the row, so it is
    # evaluated once instead of per row.
    detail_keywords = ('details', 'info', 'specifications', 'tell me about', 'what is the')
    if any(keyword in query_lower for keyword in detail_keywords):
        for _, row in df.iterrows():
            brand_value, model_value = row['brand'], row['model']
            if pd.isna(brand_value) or pd.isna(model_value):
                continue  # Rows without a name cannot be matched by name
            brand_name, model_name = str(brand_value), str(model_value)
            # "brand model" appearing verbatim implies both parts appear, so
            # checking the two parts covers the full-name case as well.
            if brand_name.lower() in query_lower and model_name.lower() in query_lower:
                return (
                    f"📌 {brand_name.title()} {model_name} Details:\n"
                    + format_car_details(row.to_dict())
                )

    # 4. General filtering based on criteria
    filtered_df = df.copy()

    # Filter by brand
    brands = find_brand_mentions(query)
    if brands:
        filtered_df = filtered_df[filtered_df["brand"].isin(brands)]

    # Filter by model (if specific models are requested alongside other filters)
    models = find_model_mentions(query)
    if models:
        filtered_df = filtered_df[filtered_df["model"].str.lower().isin(models)]

    # Filter by price: the lower bound is tested against price_lakh_min and
    # the upper bound against price_lakh_max.
    min_price, max_price = extract_price_range(query)
    if min_price is not None:
        filtered_df = filtered_df[filtered_df["price_lakh_min"] >= min_price]
    if max_price is not None:
        filtered_df = filtered_df[filtered_df["price_lakh_max"] <= max_price]

    # Filter by mileage/range (same min/max column convention as price)
    min_mileage, max_mileage = extract_mileage_range(query)
    if min_mileage is not None:
        filtered_df = filtered_df[filtered_df["mileage_kmpl_min"] >= min_mileage]
    if max_mileage is not None:
        filtered_df = filtered_df[filtered_df["mileage_kmpl_max"] <= max_mileage]

    # Filter by fuel type. "ev" must be a whole word: a plain substring test
    # also fires on "seven", "every", "review", ...
    fuel_types_in_query = []
    if 'petrol' in query_lower:
        fuel_types_in_query.append('petrol')
    if 'diesel' in query_lower:
        fuel_types_in_query.append('diesel')
    if 'electric' in query_lower or re.search(r'\bevs?\b', query_lower):
        fuel_types_in_query.append('electric')
    if 'cng' in query_lower:
        fuel_types_in_query.append('cng')
    if 'hybrid' in query_lower:
        fuel_types_in_query.append('hybrid')

    if fuel_types_in_query:
        # NOTE(review): isin() is an exact match, so 'hybrid' only selects rows
        # whose fuel_type is exactly 'hybrid' -- confirm the dataset's values.
        filtered_df = filtered_df[filtered_df['fuel_type'].isin(fuel_types_in_query)]

    # Filter by transmission ("auto" as a whole word, so "automobile" does
    # not count as a transmission request)
    transmission_types_in_query = []
    if 'automatic' in query_lower or re.search(r'\bauto\b', query_lower):
        transmission_types_in_query.append('automatic')
    if 'manual' in query_lower:
        transmission_types_in_query.append('manual')

    if transmission_types_in_query:
        filtered_df = filtered_df[filtered_df['transmission'].isin(transmission_types_in_query)]

    # Filter by seating capacity. Accepts "7 seater", "7-seater", "7seater"
    # and "7 seats".
    seating_match = re.search(r'(\d+)[\s-]*seat', query_lower)
    if seating_match:
        seats = int(seating_match.group(1))
        filtered_df = filtered_df[filtered_df['seating_capacity'] == seats]

    # Filter by body type (substring match so plurals like "suvs" count)
    body_types_in_query = [
        body_type
        for body_type in ('suv', 'sedan', 'hatchback', 'muv', 'pickup')
        if body_type in query_lower
    ]
    if body_types_in_query:
        filtered_df = filtered_df[filtered_df['body_type'].isin(body_types_in_query)]

    # Filter by general features (from the 'features' column); every keyword
    # must be present.
    general_features_from_query = extract_features_from_query(query)
    for feature_keyword in general_features_from_query:
        filtered_df = filtered_df[filtered_df["features"].str.contains(feature_keyword, na=False)]

    # Final response generation
    if filtered_df.empty:
        return "❌ No matching cars found for your query. Please try adjusting your requirements or asking a broader question."

    response = ""

    # Sort results based on query intent
    if any(word in query_lower for word in ['cheap', 'affordable', 'budget', 'lowest price']):
        filtered_df = filtered_df.sort_values('price_lakh_min')
    elif any(word in query_lower for word in ['expensive', 'premium', 'luxury', 'highest price']):
        filtered_df = filtered_df.sort_values('price_lakh_min', ascending=False)
    elif any(word in query_lower for word in ['mileage', 'fuel efficient', 'economy', 'best mileage']):
        filtered_df = filtered_df.sort_values('mileage_kmpl_min', ascending=False)
    else:  # Default sort
        filtered_df = filtered_df.sort_values('price_lakh_min')

    total_matches = len(filtered_df)

    # A "simple brand query" names exactly one brand and nothing else; it gets
    # a compact list plus a summary. Filters are detected from what was
    # actually applied above, so aliases such as "ev" count as a filter.
    has_other_filters = (
        bool(models)
        or min_price is not None or max_price is not None
        or min_mileage is not None or max_mileage is not None
        or bool(general_features_from_query)
        or bool(fuel_types_in_query)
        or bool(transmission_types_in_query)
        or seating_match is not None
        or bool(body_types_in_query)
    )
    intent_words = ['cheap', 'expensive', 'best', 'compare', 'vs', 'average', 'count', 'how many']
    is_simple_brand_query = (
        len(brands) == 1
        and total_matches > 5
        and not has_other_filters
        and not any(word in query_lower for word in intent_words)
    )

    if is_simple_brand_query:
        response += f"🏷️ Found {total_matches} {brands[0].title()} models. Here's a summary:\n\n"

        for _, row in filtered_df.iterrows():
            entry = format_car_details(row.to_dict(), show_features=False, compact=True) + "\n"
            if len(response) + len(entry) > MAX_TOTAL_CHARACTERS * 0.9:
                response += "\n... (further results truncated for length)\n"
                break
            response += entry

        # Summary statistics for the brand
        avg_price = filtered_df['price_lakh_min'].mean()
        avg_mileage = filtered_df['mileage_kmpl_min'].mean()
        price_range = f"₹{filtered_df['price_lakh_min'].min():.2f}-{filtered_df['price_lakh_max'].max():.2f} Lakh"
        response += f"\n📊 Summary for {brands[0].title()} cars: Avg. Min Price: ₹{avg_price:.2f}L | Avg. Min Mileage/Range: {avg_mileage:.2f} | Price Range: {price_range}"

    else:
        # For more specific filters, show detailed cards, up to a limit
        response += f"Found {total_matches} matching cars:\n\n" if total_matches > 1 else "Found 1 matching car:\n\n"

        # Limit detailed results to keep the response manageable
        max_detailed_cars = 8 if total_matches <= 10 else 5

        shown = 0
        truncated_for_length = False
        for _, row in filtered_df.head(max_detailed_cars).iterrows():
            entry = format_car_details(row.to_dict(), show_features=True) + "\n"
            if len(response) + len(entry) > MAX_TOTAL_CHARACTERS:
                truncated_for_length = True
                break
            response += entry
            shown += 1

        # One trailer only, counting the cars that were really left out
        # (whether because of the card limit or the length limit).
        remaining = total_matches - shown
        if remaining > 0:
            response += f"\n... and {remaining} more cars match your criteria."
            if truncated_for_length:
                response += " Please refine your search."

    return response.strip()
|
| 583 |
|
| 584 |
# Enhanced Gradio interface
|
|
|
|
| 590 |
"Best mileage car under 10 lakhs",
|
| 591 |
"Mahindra cars with price and mileage",
|
| 592 |
"Cars between 5 and 15 lakhs",
|
|
|
|
| 593 |
"Show me all Honda models",
|
| 594 |
+
"Average price of cars in database",
|
| 595 |
+
"Electric cars with ADAS",
|
| 596 |
+
"SUVs with 7 seats and good mileage",
|
| 597 |
+
"Diesel cars from Toyota",
|
| 598 |
+
"Tell me about the Skoda Slavia"
|
| 599 |
]
|
| 600 |
|
| 601 |
gr.Interface(
|
|
|
|
| 610 |
label="Car Information"
|
| 611 |
),
|
| 612 |
title="🚘 Enhanced Indian Car AI Assistant",
|
| 613 |
+
description="Ask me anything about Indian cars! I can help with comparisons, recommendations, specifications, and more. Data based on 'indian_car_info.csv'.",
|
| 614 |
examples=examples,
|
| 615 |
theme="soft"
|
| 616 |
).launch()
|