Satwickchikkala1 committed on
Commit
9f566d2
·
verified ·
1 Parent(s): 920f99e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +446 -151
app.py CHANGED
@@ -4,17 +4,151 @@ import re
4
  import numpy as np
5
  from typing import List, Dict, Any
6
 
 
7
  # Load and clean the dataset
8
- df = pd.read_csv("indian_car_info.csv")
 
 
 
 
 
 
 
 
9
 
10
  # Clean brand and model columns
11
  df["brand"] = df["brand"].str.strip().str.lower()
12
  df["model"] = df["model"].str.strip()
13
- df["features"] = df["features"].astype(str).str.lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # Control long responses
16
  MAX_TOTAL_CHARACTERS = 5000
17
 
 
 
18
  def extract_numbers(text: str) -> List[float]:
19
  """Extract all numbers from text"""
20
  return [float(x) for x in re.findall(r'\d+\.?\d*', text)]
@@ -22,271 +156,429 @@ def extract_numbers(text: str) -> List[float]:
22
  def find_brand_mentions(query: str) -> List[str]:
23
  """Find all brand mentions in query"""
24
  unique_brands = df["brand"].unique()
25
- return [brand for brand in unique_brands if brand in query.lower()]
 
 
 
 
 
26
 
27
  def find_model_mentions(query: str) -> List[str]:
28
  """Find all model mentions in query"""
29
  unique_models = df["model"].str.lower().unique()
30
- return [model for model in unique_models if model.lower() in query.lower()]
 
 
 
 
31
 
32
  def extract_price_range(query: str) -> tuple:
33
  """Extract price range from query"""
34
  min_price, max_price = None, None
 
35
 
36
- # Pattern for "under X", "below X", "less than X"
37
- under_match = re.search(r'(?:under|below|less than|up to)\s*₹?(\d+)', query.lower())
38
  if under_match:
39
  max_price = float(under_match.group(1))
40
-
41
- # Pattern for "above X", "more than X", "at least X"
42
- above_match = re.search(r'(?:above|more than|at least|over)\s*₹?(\d+)', query.lower())
43
  if above_match:
44
  min_price = float(above_match.group(1))
45
-
46
  # Pattern for "between X and Y"
47
- between_match = re.search(r'between\s*₹?(\d+)\s*(?:and|to)\s*₹?(\d+)', query.lower())
48
  if between_match:
49
- min_price = float(between_match.group(1))
50
- max_price = float(between_match.group(2))
51
-
 
 
52
  # Pattern for "around X", "approximately X"
53
- around_match = re.search(r'(?:around|approximately|about)\s*₹?(\d+)', query.lower())
54
  if around_match:
55
  target = float(around_match.group(1))
56
- min_price = target - 2
57
- max_price = target + 2
58
-
59
  return min_price, max_price
60
 
61
  def extract_mileage_range(query: str) -> tuple:
62
  """Extract mileage requirements from query"""
63
  min_mileage, max_mileage = None, None
 
64
 
65
- # Look for mileage-related keywords
66
- mileage_keywords = ['mileage', 'fuel efficiency', 'kmpl', 'fuel economy']
67
- has_mileage_context = any(keyword in query.lower() for keyword in mileage_keywords)
68
 
69
  if has_mileage_context:
70
  # Pattern for "above X kmpl", "more than X kmpl"
71
- above_match = re.search(r'(?:above|more than|at least|over)\s*(\d+)', query.lower())
72
  if above_match:
73
  min_mileage = float(above_match.group(1))
74
-
75
  # Pattern for "below X kmpl", "under X kmpl"
76
- below_match = re.search(r'(?:below|under|less than)\s*(\d+)', query.lower())
77
  if below_match:
78
  max_mileage = float(below_match.group(1))
79
-
80
  return min_mileage, max_mileage
81
 
82
- def extract_features(query: str) -> List[str]:
83
- """Extract feature requirements from query"""
84
- feature_keywords = [
85
- "sunroof", "automatic", "manual", "cruise control", "abs", "airbags",
86
- "android auto", "touchscreen", "rear camera", "parking sensor",
87
- "bluetooth", "usb", "keyless", "push button", "climate control",
88
- "leather seats", "alloy wheels", "fog lights", "power steering",
89
- "power windows", "central locking", "music system", "navigation"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  ]
91
 
92
- return [feat for feat in feature_keywords if feat in query.lower()]
 
 
 
 
93
 
94
  def get_comparison_cars(query: str) -> List[Dict]:
95
- """Handle comparison queries"""
96
- # Look for comparison keywords
97
- comparison_words = ['vs', 'versus', 'compare', 'comparison', 'better', 'best']
98
- if not any(word in query.lower() for word in comparison_words):
99
- return []
100
-
101
- brands = find_brand_mentions(query)
102
- models = find_model_mentions(query)
103
 
104
- if len(brands) >= 2 or len(models) >= 2:
105
- # Return cars for comparison
106
- if models:
107
- return df[df["model"].str.lower().isin(models)].to_dict('records')
108
- else:
109
- return df[df["brand"].isin(brands)].to_dict('records')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  return []
112
 
113
  def handle_specific_questions(query: str) -> str:
114
- """Handle specific question types"""
115
  query_lower = query.lower()
116
 
117
- # Price questions
118
  if any(word in query_lower for word in ['cheapest', 'lowest price', 'most affordable']):
119
- cheapest = df.loc[df['price_lakh'].idxmin()]
120
- return f"💰 Cheapest car: {cheapest['brand'].title()} {cheapest['model']} at ₹{cheapest['price_lakh']} Lakh"
121
 
122
  if any(word in query_lower for word in ['most expensive', 'highest price', 'premium']):
123
- expensive = df.loc[df['price_lakh'].idxmax()]
124
- return f"💎 Most expensive car: {expensive['brand'].title()} {expensive['model']} at ₹{expensive['price_lakh']} Lakh"
125
 
126
- # Mileage questions
127
  if any(word in query_lower for word in ['best mileage', 'highest mileage', 'most fuel efficient']):
128
- best_mileage = df.loc[df['mileage_kmpl'].idxmax()]
129
- return f"⛽ Best mileage car: {best_mileage['brand'].title()} {best_mileage['model']} with {best_mileage['mileage_kmpl']} kmpl"
 
 
 
 
130
 
131
  if any(word in query_lower for word in ['worst mileage', 'lowest mileage', 'least fuel efficient']):
132
- worst_mileage = df.loc[df['mileage_kmpl'].idxmin()]
133
- return f"⛽ Lowest mileage car: {worst_mileage['brand'].title()} {worst_mileage['model']} with {worst_mileage['mileage_kmpl']} kmpl"
 
 
 
134
 
135
  # Count questions
136
  if any(word in query_lower for word in ['how many', 'count', 'number of']):
137
- if any(brand in query_lower for brand in df['brand'].unique()):
138
- brand = next(brand for brand in df['brand'].unique() if brand in query_lower)
 
139
  count = len(df[df['brand'] == brand])
140
- return f"📊 {brand.title()} has {count} cars in our database"
141
- else:
142
- return f"📊 Total cars in database: {len(df)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
  # Average questions
145
  if 'average' in query_lower:
146
  if 'price' in query_lower:
147
- avg_price = df['price_lakh'].mean()
148
- return f"📊 Average car price: ₹{avg_price:.2f} Lakh"
149
- elif 'mileage' in query_lower:
150
- avg_mileage = df['mileage_kmpl'].mean()
151
- return f"📊 Average mileage: {avg_mileage:.2f} kmpl"
152
 
153
- # Brand-specific questions
154
  brands = find_brand_mentions(query)
155
- if brands and any(word in query_lower for word in ['models', 'variants', 'options']):
156
  brand = brands[0]
157
  brand_cars = df[df['brand'] == brand]
158
  models = brand_cars['model'].unique()
159
- return f"🚗 {brand.title()} models: {', '.join(models)}"
160
 
 
 
 
 
161
  return ""
162
 
163
  def format_car_details(car: Dict, show_features: bool = True, compact: bool = False) -> str:
164
- """Format car details for display"""
 
 
 
 
165
  if compact:
166
- # Compact format for showing many cars
167
- return f"🚗 {car['brand'].title()} {car['model']} | ₹{car['price_lakh']}L | {car['mileage_kmpl']} kmpl | {car['engine']}"
168
-
 
 
 
169
  features_text = ""
170
- if show_features and 'features' in car:
171
- features = car['features'][:200] + "..." if len(car['features']) > 200 else car['features']
172
- features_text = f"- Features: {features.title()}\n"
173
-
174
- return f"""🚗 {car['brand'].title()} {car['model']}
175
- - Engine: {car['engine']}
176
- - Mileage: {car['mileage_kmpl']} kmpl
177
- - Price: ₹{car['price_lakh']} Lakh
178
- {features_text}"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
  def answer_question(query: str) -> str:
181
  if not query.strip():
182
  return "❓ Please ask me something about Indian cars!"
183
 
184
  query = query.strip()
185
-
186
- # Handle specific questions first
 
187
  specific_answer = handle_specific_questions(query)
188
  if specific_answer:
189
  return specific_answer
190
 
191
- # Handle comparisons
192
  comparison_cars = get_comparison_cars(query)
193
  if comparison_cars:
194
- response = "📊 Car Comparison:\n\n"
195
- for car in comparison_cars[:3]: # Limit to 3 cars
196
- response += format_car_details(car, show_features=False) + "\n"
 
 
 
197
  return response.strip()
198
 
199
- # Check for specific car mention (brand + model)
 
200
  for _, row in df.iterrows():
201
  car_name = f"{row['brand']} {row['model']}".lower()
202
- if car_name in query.lower():
203
- return f"📌 {row['brand'].title()} {row['model']} Details:\n" + format_car_details(row.to_dict())
 
 
 
204
 
205
- # Start filtering
206
  filtered_df = df.copy()
207
 
208
  # Filter by brand
209
  brands = find_brand_mentions(query)
210
  if brands:
211
  filtered_df = filtered_df[filtered_df["brand"].isin(brands)]
212
-
213
- # Filter by model
214
  models = find_model_mentions(query)
215
  if models:
216
  filtered_df = filtered_df[filtered_df["model"].str.lower().isin(models)]
217
-
218
  # Filter by price
219
  min_price, max_price = extract_price_range(query)
220
  if min_price is not None:
221
- filtered_df = filtered_df[filtered_df["price_lakh"] >= min_price]
222
  if max_price is not None:
223
- filtered_df = filtered_df[filtered_df["price_lakh"] <= max_price]
224
-
225
- # Filter by mileage
226
  min_mileage, max_mileage = extract_mileage_range(query)
227
  if min_mileage is not None:
228
- filtered_df = filtered_df[filtered_df["mileage_kmpl"] >= min_mileage]
229
  if max_mileage is not None:
230
- filtered_df = filtered_df[filtered_df["mileage_kmpl"] <= max_mileage]
231
-
232
- # Filter by features
233
- features = extract_features(query)
234
- for feature in features:
235
- filtered_df = filtered_df[filtered_df["features"].str.contains(feature, na=False)]
236
-
237
- # Sort results based on query intent
238
- if any(word in query.lower() for word in ['cheap', 'affordable', 'budget']):
239
- filtered_df = filtered_df.sort_values('price_lakh')
240
- elif any(word in query.lower() for word in ['expensive', 'premium', 'luxury']):
241
- filtered_df = filtered_df.sort_values('price_lakh', ascending=False)
242
- elif any(word in query.lower() for word in ['mileage', 'fuel efficient', 'economy']):
243
- filtered_df = filtered_df.sort_values('mileage_kmpl', ascending=False)
244
-
245
- # Generate response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  if filtered_df.empty:
247
- return "❌ No matching cars found for your query. Try adjusting your requirements!"
248
-
249
  response = ""
250
 
251
- # Check if it's a simple brand query (show all cars from that brand)
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  is_simple_brand_query = (
253
- len(brands) == 1 and
254
- not models and
255
- min_price is None and max_price is None and
256
- min_mileage is None and max_mileage is None and
257
- not features and
258
- not any(word in query.lower() for word in ['cheap', 'expensive', 'best', 'compare', 'vs'])
 
259
  )
260
-
261
- if is_simple_brand_query and len(filtered_df) > 3:
262
- # Show all cars for simple brand queries in compact format
263
- response += f"🏷️ All {brands[0].title()} cars in our database ({len(filtered_df)} models):\n\n"
264
  for _, row in filtered_df.iterrows():
265
- response += format_car_details(row.to_dict(), show_features=False, compact=True) + "\n"
 
 
 
 
266
 
267
- # Add a summary
268
- avg_price = filtered_df['price_lakh'].mean()
269
- avg_mileage = filtered_df['mileage_kmpl'].mean()
270
- price_range = f"₹{filtered_df['price_lakh'].min()}-{filtered_df['price_lakh'].max()}L"
271
- response += f"\n📊 Summary: Average price: ₹{avg_price:.1f}L | Average mileage: {avg_mileage:.1f} kmpl | Price range: {price_range}"
272
 
273
  else:
274
- # Regular detailed format for filtered results
275
- if len(filtered_df) > 1:
276
- response += f"Found {len(filtered_df)} matching cars:\n\n"
277
 
278
- # Determine how many cars to show in detail
279
- max_detailed_cars = 8 if len(filtered_df) <= 10 else 5
280
 
281
  for _, row in filtered_df.head(max_detailed_cars).iterrows():
282
- entry = format_car_details(row.to_dict()) + "\n"
283
  if len(response + entry) > MAX_TOTAL_CHARACTERS:
 
284
  break
285
  response += entry
286
-
287
- if len(filtered_df) > max_detailed_cars:
288
- response += f"\n... and {len(filtered_df) - max_detailed_cars} more cars match your criteria."
289
-
290
  return response.strip()
291
 
292
  # Enhanced Gradio interface
@@ -298,9 +590,12 @@ examples = [
298
  "Best mileage car under 10 lakhs",
299
  "Mahindra cars with price and mileage",
300
  "Cars between 5 and 15 lakhs",
301
- "Which car has the best features?",
302
  "Show me all Honda models",
303
- "Average price of cars in database"
 
 
 
 
304
  ]
305
 
306
  gr.Interface(
@@ -315,7 +610,7 @@ gr.Interface(
315
  label="Car Information"
316
  ),
317
  title="🚘 Enhanced Indian Car AI Assistant",
318
- description="Ask me anything about Indian cars! I can help with comparisons, recommendations, specifications, and more.",
319
  examples=examples,
320
  theme="soft"
321
  ).launch()
 
4
  import numpy as np
5
  from typing import List, Dict, Any
6
 
7
+ # --- Data Loading and Initial Cleaning ---
8
  # Load and clean the dataset
9
+ # Assuming indian_car_info.csv is properly formatted CSV.
10
+ # If your CSV is literally just lines of text without proper CSV quoting for features,
11
+ # you'd need a more complex custom parser than pd.read_csv.
12
+ # Given the snippet, it looks like features are quoted, which pd.read_csv handles.
13
# Load the dataset; abort start-up with a non-zero status if it is missing.
try:
    df = pd.read_csv("indian_car_info.csv")
except FileNotFoundError:
    # `exit()` is an interactive-shell helper installed by the `site` module
    # and, called without arguments, terminates with status 0. Raising
    # SystemExit with a message prints it to stderr and exits with status 1.
    raise SystemExit(
        "Error: indian_car_info.csv not found. Please ensure the file is in the same directory."
    ) from None
18
 
19
  # Clean brand and model columns
20
  df["brand"] = df["brand"].str.strip().str.lower()
21
  df["model"] = df["model"].str.strip()
22
+ df["features"] = df["features"].astype(str).str.lower() # Ensure features are string and lowercased
23
+
24
+ # --- Advanced Data Preprocessing: Parsing Ranges and Inferring Categories ---
25
+
26
def parse_price_mileage_range(value_str: Any) -> tuple:
    """Parse a price or mileage cell into a ``(min_val, max_val)`` pair.

    Accepted shapes include a single figure (``'6.49'``), a range
    (``'24.8-25.75'``, also with an en dash or the word "to"), an expected
    range (``'Expected ~18-20'``) and an EV range (``'452 km range'``).
    Currency symbols and unit words around the figures are ignored.

    Args:
        value_str: Raw cell value; may be a string, a number or NaN/None.

    Returns:
        ``(min_val, max_val)`` as floats. A single figure is returned twice.
        ``(np.nan, np.nan)`` is returned when the value is missing, contains
        no number, or contains more than two numbers (ambiguous).
    """
    if pd.isna(value_str):
        return np.nan, np.nan

    text = str(value_str).lower()
    # Pull the numeric tokens out instead of calling float() on the whole
    # string, so trailing units ("kmpl", "km/kg", "lakhs") do not turn an
    # otherwise valid value into NaN.
    numbers = [float(token) for token in re.findall(r'\d+(?:\.\d+)?', text)]
    if not numbers:
        return np.nan, np.nan

    # EV entries such as "452 km range": only the first figure is the range.
    if 'km range' in text:
        return numbers[0], numbers[0]

    # More than two figures cannot be interpreted as "value" or "low-high".
    if len(numbers) > 2:
        return np.nan, np.nan

    return min(numbers), max(numbers)
72
+
73
+ # Apply the parsing function to create min/max columns
74
+ df[['mileage_kmpl_min', 'mileage_kmpl_max']] = df['mileage_kmpl'].apply(lambda x: pd.Series(parse_price_mileage_range(x)))
75
+ df[['price_lakh_min', 'price_lakh_max']] = df['price_lakh'].apply(lambda x: pd.Series(parse_price_mileage_range(x)))
76
+
77
+ # Infer new columns from 'engine' and 'features' for better filtering
78
def infer_car_attributes(row: pd.Series) -> pd.Series:
    """Derive fuel type, transmission, seating and body type for one car row.

    The inference is keyword based and reads the ``engine``, ``features`` and
    ``model`` fields of *row*.

    Args:
        row: A DataFrame row providing ``engine``, ``features`` and ``model``.

    Returns:
        A Series with the keys ``fuel_type``, ``transmission``,
        ``seating_capacity`` (int, or NaN when not stated) and ``body_type``.
    """
    # str() keeps a missing (NaN) cell from raising AttributeError on .lower().
    engine = str(row['engine']).lower()
    features = str(row['features']).lower()
    model = str(row['model']).lower()

    # --- Fuel type (petrol is the default) ---
    if 'diesel' in engine or 'diesel' in features:
        fuel_type = 'diesel'
    elif 'cng' in engine or 'cng' in features:
        fuel_type = 'cng'
    elif 'hybrid' in engine or 'hybrid' in features:
        # Checked before "electric": hybrid descriptions usually mention an
        # electric motor as well. Strong and mild hybrids are not told apart.
        fuel_type = 'hybrid'
    elif ('electric' in engine or 'motor' in engine
          or re.search(r'\belectric\b', features)):
        # Whole-word match on features so "electrically adjustable ORVMs"
        # does not mark a petrol car as an EV.
        # NOTE(review): "electric sunroof" style phrases would still match.
        fuel_type = 'electric'
    else:
        fuel_type = 'petrol'

    # --- Transmission (manual is the default) ---
    # Abbreviations must stand alone: a plain substring test for "at" also
    # fires on "heated", "seats", "climate", etc.
    automatic_token = re.search(
        r'(?<![a-z])(?:automatic|amt|at|dct|cvt)(?![a-z])', features
    )
    if automatic_token or 'paddle shifters' in features:
        transmission = 'automatic'
    else:
        # For cars sold with both gearboxes only one value is stored.
        transmission = 'manual'

    # --- Seating capacity ---
    # Covers "7-seater", "5 seater" and "6/7 seater" (larger figure wins).
    seating_capacity = np.nan
    seat_match = re.search(r'(\d+)(?:\s*/\s*(\d+))?[\s-]*seater', features)
    if seat_match:
        seating_capacity = max(int(g) for g in seat_match.groups() if g)

    # --- Body type (first matching keyword wins) ---
    if 'suv' in features or 'suv' in model:
        body_type = 'suv'
    elif 'sedan' in features or 'sedan' in model:
        body_type = 'sedan'
    elif 'hatchback' in features or 'hatchback' in model:
        body_type = 'hatchback'
    elif 'muv' in features:
        body_type = 'muv'
    elif 'pickup truck' in features:
        body_type = 'pickup'
    else:
        body_type = 'other'

    return pd.Series({
        'fuel_type': fuel_type,
        'transmission': transmission,
        'seating_capacity': seating_capacity,
        'body_type': body_type,
    })
137
+
138
+ # Apply attribute inference to the DataFrame
139
+ df = df.assign(**df.apply(infer_car_attributes, axis=1).to_dict('list'))
140
+
141
+ # Convert numeric columns to appropriate types, coercing errors to NaN
142
+ numeric_cols = ['mileage_kmpl_min', 'mileage_kmpl_max', 'price_lakh_min', 'price_lakh_max', 'seating_capacity']
143
+ for col in numeric_cols:
144
+ df[col] = pd.to_numeric(df[col], errors='coerce')
145
+
146
 
147
  # Control long responses
148
  MAX_TOTAL_CHARACTERS = 5000
149
 
150
+ # --- Helper Functions for Query Parsing ---
151
+
152
def extract_numbers(text: str) -> List[float]:
    """Return every number found in *text*, in order of appearance.

    Integers and decimals are both recognised; each is returned as a float.
    An empty list is returned when *text* contains no digits.
    """
    return [float(token) for token in re.findall(r'\d+\.?\d*', text)]
 
156
def find_brand_mentions(query: str) -> List[str]:
    """Return the brands from the dataset that are mentioned in *query*.

    Matching is case-insensitive and whole-name: a brand only counts when it
    is not glued to another word character on either side.

    Args:
        query: The user's free-text question.

    Returns:
        The matching brand names as stored in ``df["brand"]``, in the order
        they first appear in the dataset.
    """
    query_lower = query.lower()  # computed once, not per brand
    # dropna(): a missing brand is a float NaN and would make re.escape raise.
    # Look-arounds instead of \b so names ending in punctuation still match.
    return [
        brand
        for brand in df["brand"].dropna().unique()
        if brand
        and re.search(r'(?<!\w)' + re.escape(brand) + r'(?!\w)', query_lower)
    ]
165
 
166
def find_model_mentions(query: str) -> List[str]:
    """Return the lowercased model names from the dataset found in *query*.

    Matching is case-insensitive and whole-name: a model only counts when it
    is not glued to another word character on either side.

    Args:
        query: The user's free-text question.

    Returns:
        The matching model names in lower case, in the order they first
        appear in the dataset.
    """
    query_lower = query.lower()  # computed once, not per model
    # dropna(): a missing model is a float NaN and would make re.escape raise.
    # Look-arounds instead of \b so names that end in a non-word character
    # (e.g. a closing parenthesis) can still match.
    return [
        model
        for model in df["model"].dropna().str.lower().unique()
        if model
        and re.search(r'(?<!\w)' + re.escape(model) + r'(?!\w)', query_lower)
    ]
174
 
175
def extract_price_range(query: str) -> tuple:
    """Extract a price window from a free-text query.

    Recognised phrasings, applied in this order (later ones overwrite
    earlier ones): "under/below/less than/up to X", "above/more than/
    at least/over X", "between X and Y", "around/approximately/about X"
    (interpreted as X +/- 20%).

    Figures that are immediately followed by a distance/mileage unit
    ("20 kmpl", "400 km") are not treated as prices.

    Args:
        query: The user's free-text question.

    Returns:
        ``(min_price, max_price)``; each element is a float or ``None`` when
        the query does not constrain that side.
    """
    min_price, max_price = None, None
    query = query.lower()

    # One amount: optional rupee sign, a whole or decimal number, and two
    # guards - the number must be complete (so the engine cannot backtrack to
    # a shorter prefix) and must not be followed by a "km..." unit.
    amount = r'₹?(\d+(?:\.\d+)?)(?!\.?\d)(?!\s*km)'

    # "under X", "below X", "less than X", "up to X"
    under_match = re.search(rf'(?:under|below|less than|up to)\s*{amount}', query)
    if under_match:
        max_price = float(under_match.group(1))

    # "above X", "more than X", "at least X", "over X"
    above_match = re.search(rf'(?:above|more than|at least|over)\s*{amount}', query)
    if above_match:
        min_price = float(above_match.group(1))

    # "between X and Y" - bounds are ordered so "between 15 and 5" works too.
    between_match = re.search(rf'between\s*{amount}\s*(?:and|to)\s*{amount}', query)
    if between_match:
        first = float(between_match.group(1))
        second = float(between_match.group(2))
        min_price, max_price = min(first, second), max(first, second)

    # "around X", "approximately X", "about X" -> +/- 20% tolerance
    around_match = re.search(rf'(?:around|approximately|about)\s*{amount}', query)
    if around_match:
        target = float(around_match.group(1))
        min_price = target * 0.8
        max_price = target * 1.2

    return min_price, max_price
206
 
207
def extract_mileage_range(query: str) -> tuple:
    """Extract a mileage (or EV range) window from a free-text query.

    Bounds are only looked for when the query mentions mileage at all
    ("mileage", "fuel efficiency", "kmpl", "fuel economy", "range").
    Figures that are immediately followed by a price unit ("10 lakhs",
    "2 crore", "10L") are not treated as mileage, so a query such as
    "best mileage car under 10 lakhs" yields no mileage bound.

    Args:
        query: The user's free-text question.

    Returns:
        ``(min_mileage, max_mileage)``; each element is a float or ``None``
        when the query does not constrain that side.
    """
    min_mileage, max_mileage = None, None
    query = query.lower()

    mileage_keywords = ('mileage', 'fuel efficiency', 'kmpl', 'fuel economy', 'range')
    if not any(keyword in query for keyword in mileage_keywords):
        return min_mileage, max_mileage

    # One figure: a whole or decimal number that is complete (so the engine
    # cannot backtrack to a shorter prefix) and not followed by a price unit.
    figure = r'(\d+(?:\.\d+)?)(?!\.?\d)(?!\s*(?:lakh|lac|crore|l\b))'

    # "above X kmpl", "more than X kmpl", ...
    above_match = re.search(rf'(?:above|more than|at least|over)\s*{figure}', query)
    if above_match:
        min_mileage = float(above_match.group(1))

    # "below X kmpl", "under X kmpl", ...
    below_match = re.search(rf'(?:below|under|less than)\s*{figure}', query)
    if below_match:
        max_mileage = float(below_match.group(1))

    return min_mileage, max_mileage
227
 
228
def extract_features_from_query(query: str) -> List[str]:
    """Return the known feature keywords that are mentioned in *query*.

    The keywords are phrases expected to occur in the dataset's ``features``
    column. Matching is case-insensitive and whole-phrase (an optional plural
    "s" is tolerated), so short acronyms such as "abs" or "dac" do not fire
    inside unrelated words like "absolutely".

    Args:
        query: The user's free-text question.

    Returns:
        The matched keywords, in the order they are listed below.
    """
    query = query.lower()
    # All entries are lower case because the query is lowercased first.
    general_feature_keywords = (
        "sunroof", "panoramic sunroof", "360-degree camera", "head-up display",
        "hud", "wireless charging", "ambient lighting", "cruise control",
        "rear ac vents", "push start/stop", "electrically adjustable orvms",
        "automatic climate control", "digital speedometer", "dual airbags",
        "abs", "ebd", "reverse parking sensors", "connected car tech", "bluelink",
        "adas", "hyundai smartsense", "honda sensing", "ventilated front seats",
        "dual zone climate control", "electronic parking brake", "auto hold",
        "apple carplay", "android auto", "bose premium sound", "jbl sound system",
        "powered tailgate", "terrain response modes", "digital instrument cluster",
        "air purifier", "traction pro mode", "dca automatic", "voice assistant",
        "paddle shifters", "allgrip awd", "4x4", "ladder frame chassis", "hill hold assist",
        "hill descent control", "ventilated front cup holders", "smart hybrid technology",
        "uv-cut glass", "boosterjet engine", "i-smart", "digital bluetooth key",
        "heated orvms", "personal ai assistant", "ultrafast charging", "v2l",
        "sliding center console", "panoramic vision roof", "meridian premium sound",
        "augmented reality hud", "multi-terrain modes", "multi-mode regen", "frunk",
        "voice-enabled sunroof", "dashcam", "footwell lighting", "heated seats",
        "powered driver seat", "triple-zone climate control", "differential lock",
        "active traction control", "dac", "cooler box", "lane watch camera",
        "multi-sense driving modes", "easyfix seats", "detachable 3rd row",
        "digital cockpit", "4motion all-wheel drive", "park assist", "citroen advanced comfort",
        "customization options", "high ground clearance", "comfortable ride",
        "roof mounted rear ac vents", "true 7-seater", "washable interior",
        "tyre pressure monitoring system", "mld technology", "micro hybrid technology",
        "static bending headlamps", "robust build quality", "harmon infotainment",
        "corner stability control", "rain sensing wipers", "automatic headlamps",
        "multi-zone climate control",
    )

    return [
        keyword
        for keyword in general_feature_keywords
        if re.search(r'(?<!\w)' + re.escape(keyword) + r's?(?!\w)', query)
    ]
268
 
269
def get_comparison_cars(query: str) -> List[Dict]:
    """Return the dataset rows for the cars a comparison query names.

    A query counts as a comparison when it contains "vs" as a standalone
    word or one of the phrases "versus", "compare", "comparison", "better",
    "which is", "difference between". Cars are identified by their full
    "brand model" name first; if that yields fewer than two cars, standalone
    model names (e.g. "Creta vs Seltos") are accepted as well.

    Args:
        query: The user's free-text question.

    Returns:
        The matching rows as ``dict`` records, or an empty list when the
        query is not a comparison or names fewer than two distinct cars.
    """
    query_lower = query.lower()

    # "vs" must stand alone - as a substring it also matches "suvs".
    comparison_phrases = ('versus', 'compare', 'comparison', 'better', 'which is', 'difference between')
    is_comparison = (
        re.search(r'(?<!\w)vs(?!\w)', query_lower) is not None
        or any(phrase in query_lower for phrase in comparison_phrases)
    )
    if not is_comparison:
        return []

    # Distinct (brand, model) pairs, lowercased; rows with missing names are skipped.
    known_cars = {
        (str(brand).lower(), str(model).lower())
        for brand, model in zip(df['brand'], df['model'])
        if pd.notna(brand) and pd.notna(model)
    }

    def mentioned(name: str) -> bool:
        """True when *name* occurs in the query as a whole name."""
        return bool(name) and re.search(
            r'(?<!\w)' + re.escape(name) + r'(?!\w)', query_lower
        ) is not None

    # Pass 1: full "brand model" names are the least ambiguous.
    selected = {car for car in known_cars if mentioned(f"{car[0]} {car[1]}")}
    # Pass 2: fall back to standalone model names when needed.
    if len(selected) < 2:
        selected |= {car for car in known_cars if mentioned(car[1])}

    if len(selected) < 2:
        return []

    # Select on the (brand, model) pair so an identically named model from
    # another brand is not pulled into the comparison.
    row_selected = [
        (str(brand).lower(), str(model).lower()) in selected
        for brand, model in zip(df['brand'], df['model'])
    ]
    return df[row_selected].to_dict('records')
296
 
297
def _extreme_row(column: str, largest: bool):
    """Return the row with the largest/smallest non-NaN *column* value, or None."""
    values = df[column].dropna()
    if values.empty:
        return None
    return df.loc[values.idxmax() if largest else values.idxmin()]


def _mileage_phrase(car) -> str:
    """Describe a row's minimum mileage, labelling EVs/hybrids as a km range."""
    if 'electric' in car['fuel_type'] or 'hybrid' in car['fuel_type']:
        return f"{car['mileage_kmpl_min']} km range (Electric/Hybrid)"
    return f"{car['mileage_kmpl_min']} kmpl"


def handle_specific_questions(query: str) -> str:
    """Answer direct factual questions without running the filter pipeline.

    Handled question types, checked in this order: cheapest / most expensive
    car, best / worst mileage, counts (by brand, fuel type, transmission or
    seats), averages (price or mileage), the model list of a brand, and
    subjective "best ..." questions.

    Args:
        query: The user's free-text question.

    Returns:
        The formatted answer, or an empty string when the query is not one
        of the handled question types (the caller then continues with its
        general search).
    """
    query_lower = query.lower()

    # --- Price extremes (based on the minimum price of each car) ---
    if any(word in query_lower for word in ['cheapest', 'lowest price', 'most affordable']):
        cheapest = _extreme_row('price_lakh_min', largest=False)
        if cheapest is not None:  # None only when no price could be parsed
            return f"💰 The cheapest car in our database is the {cheapest['brand'].title()} {cheapest['model']} at ₹{cheapest['price_lakh_min']:.2f} Lakh."

    if any(word in query_lower for word in ['most expensive', 'highest price', 'premium']):
        expensive = _extreme_row('price_lakh_min', largest=True)
        if expensive is not None:
            return f"💎 The most expensive car in our database is the {expensive['brand'].title()} {expensive['model']} at ₹{expensive['price_lakh_min']:.2f} Lakh."

    # --- Mileage extremes (based on the minimum mileage of each car) ---
    if any(word in query_lower for word in ['best mileage', 'highest mileage', 'most fuel efficient']):
        best_mileage = _extreme_row('mileage_kmpl_min', largest=True)
        if best_mileage is not None:
            return f"⛽ The car with the best mileage/range is the {best_mileage['brand'].title()} {best_mileage['model']} with {_mileage_phrase(best_mileage)}."

    if any(word in query_lower for word in ['worst mileage', 'lowest mileage', 'least fuel efficient']):
        worst_mileage = _extreme_row('mileage_kmpl_min', largest=False)
        if worst_mileage is not None:
            return f"⛽ The car with the lowest mileage/range is the {worst_mileage['brand'].title()} {worst_mileage['model']} with {_mileage_phrase(worst_mileage)}."

    # --- Count questions ---
    if any(word in query_lower for word in ['how many', 'count', 'number of']):
        brands = find_brand_mentions(query)
        if brands:
            brand = brands[0]
            count = len(df[df['brand'] == brand])
            return f"📊 {brand.title()} has {count} car models in our database."

        for ft in ['petrol', 'diesel', 'electric', 'cng', 'hybrid']:
            if ft in query_lower and 'car' in query_lower:
                count = len(df[df['fuel_type'] == ft])
                return f"📊 There are {count} {ft.title()} car models in our database."

        for tr in ['automatic', 'manual']:
            if tr in query_lower and 'car' in query_lower:
                count = len(df[df['transmission'] == tr])
                return f"📊 There are {count} {tr.title()} transmission car models in our database."

        seating_match = re.search(r'(\d+)\s*seat', query_lower)
        if seating_match:
            seats = int(seating_match.group(1))
            count = len(df[df['seating_capacity'] == seats])
            return f"📊 There are {count} car models with {seats} seats in our database."

        return f"📊 Total cars in database: {len(df)}."

    # --- Average questions ---
    if 'average' in query_lower:
        if 'price' in query_lower:
            avg_price = df['price_lakh_min'].mean()
            return f"📊 The average minimum car price in our database is ₹{avg_price:.2f} Lakh."
        elif 'mileage' in query_lower or 'fuel efficiency' in query_lower:
            avg_mileage = df['mileage_kmpl_min'].mean()
            return f"📊 The average minimum mileage/range in our database is {avg_mileage:.2f} kmpl/km."

    # --- Brand-specific model list ---
    # NOTE(review): because 'cars' is a trigger word, any query naming a brand
    # plus "cars" (e.g. "Tata cars under 10 lakhs") is answered here with the
    # plain model list and never reaches the price/mileage filters - confirm
    # this is intended.
    brands = find_brand_mentions(query)
    if brands and any(word in query_lower for word in ['models', 'variants', 'options', 'cars']):
        brand = brands[0]
        brand_cars = df[df['brand'] == brand]
        models = brand_cars['model'].unique()
        # map(str, ...) keeps join from raising on a missing (NaN) model name.
        return f"🚗 {brand.title()} has the following models in our database: {', '.join(map(str, models))}."

    # --- Subjective questions that cannot be answered from the data ---
    if 'best features' in query_lower or 'most luxurious' in query_lower or 'most reliable' in query_lower or 'safest' in query_lower:
        return "🤔 That's a great question, but 'best' or 'most luxurious' can be subjective! I can tell you about specific features if you ask, or list cars based on quantifiable criteria like price, mileage, or presence of ADAS/sunroof."

    return ""
376
 
377
def _display_value(value: Any) -> str:
    """Render *value* for display, using 'N/A' for missing (None/NaN) values."""
    if value is None or pd.isna(value):
        return 'N/A'
    return str(value)


def format_car_details(car: Dict, show_features: bool = True, compact: bool = False) -> str:
    """Format one car record as display text.

    Args:
        car: A row of the dataset as a dict (see the keys read below).
        show_features: Include a "Key Features" line (detailed format only);
            the feature text is truncated to 200 characters.
        compact: Return a single-line summary instead of the detailed block.

    Returns:
        The formatted text. Missing or NaN values are shown as "N/A"; the
        price is shown as a single figure when minimum and maximum coincide
        and as "min-max" otherwise.
    """
    # EVs and hybrids are labelled with a driving range instead of kmpl.
    fuel_type = str(car.get('fuel_type', '')).lower()
    unit = 'km range' if ('electric' in fuel_type or 'hybrid' in fuel_type) else 'kmpl'
    mileage_text = f"{_display_value(car.get('mileage_kmpl_min'))} {unit}"

    price_min = _display_value(car.get('price_lakh_min'))
    price_max = _display_value(car.get('price_lakh_max'))

    if compact:
        return (
            f"🚗 {car.get('brand', '').title()} {car.get('model', '')} | "
            f"₹{price_min}L | {mileage_text} | "
            f"{car.get('engine', 'N/A')}"
        )

    # Collapse "8.0-8.0" to "8.0" and drop a missing bound from the range.
    price_parts = []
    for part in (price_min, price_max):
        if part != 'N/A' and part not in price_parts:
            price_parts.append(part)
    price_text = '-'.join(price_parts) if price_parts else 'N/A'

    features_text = ""
    features = car.get('features')
    if show_features and features is not None and pd.notna(features):
        features = str(features)
        # A missing cell that was cast with astype(str) arrives as "nan".
        if features.strip().lower() not in ('', 'nan'):
            display_features = features[:200]
            if len(features) > 200:
                display_features += "..."
            features_text = f"- Key Features: {display_features.title()}\n"

    details = (
        f"🚗 {car.get('brand', '').title()} {car.get('model', '')}\n"
        f"- Engine: {car.get('engine', 'N/A')}\n"
        f"- Fuel Type: {str(car.get('fuel_type', 'N/A')).title()}\n"
        f"- Transmission: {str(car.get('transmission', 'N/A')).title()}\n"
        f"- Mileage/Range: {mileage_text}\n"
        f"- Price: ₹{price_text} Lakh\n"
    )
    if pd.notna(car.get('seating_capacity')):
        details += f"- Seating: {int(car['seating_capacity'])}-seater\n"
    if pd.notna(car.get('body_type')):
        details += f"- Body Type: {str(car.get('body_type', 'N/A')).title()}\n"

    details += features_text
    return details
413
+
414
+ # --- Main Answer Function ---
415
 
416
def answer_question(query: str) -> str:
    """Answer a free-text question about the cars in ``df``.

    The query is handled by the first stage that applies:

    1. ``handle_specific_questions`` (returned as-is when non-empty).
    2. A comparison of the cars returned by ``get_comparison_cars``.
    3. A detail card when the query names both the brand and model of a row
       and contains a "details"-style keyword.
    4. Otherwise the dataset is filtered by brand, model, price, mileage,
       fuel type, transmission, seating, body type and feature keywords,
       sorted according to intent words in the query, and listed.

    Args:
        query: The user's question.

    Returns:
        The response text; a prompt message when the query is empty and a
        "no matching cars" message when the filters leave no rows.
    """
    if not query or not query.strip():
        return "❓ Please ask me something about Indian cars!"

    query = query.strip()
    query_lower = query.lower()  # Used for all case-insensitive checks

    # 1. Specific questions (e.g. "cheapest car", "how many Maruti cars")
    specific_answer = handle_specific_questions(query)
    if specific_answer:
        return specific_answer

    # 2. Direct car comparisons (e.g. "Creta vs Seltos")
    comparison_cars = get_comparison_cars(query)
    if comparison_cars:
        response = "📊 Here's a comparison of the cars you asked about:\n\n"
        for car in comparison_cars:
            response += format_car_details(car, show_features=True) + "\n"
            if len(response) > MAX_TOTAL_CHARACTERS * 0.8:  # Limit comparison length
                response += "\n... (some details truncated for brevity)\n"
                break
        return response.strip()

    # 3. Single specific car (e.g. "details of Tata Nexon"). Runs after the
    # comparison stage so "compare Nexon vs Harrier" is handled there. The
    # keyword test does not depend on the row, so it is done once up front.
    detail_keywords = ('details', 'info', 'specifications', 'tell me about', 'what is the')
    if any(keyword in query_lower for keyword in detail_keywords):
        for _, row in df.iterrows():
            if row['brand'].lower() in query_lower and row['model'].lower() in query_lower:
                return (
                    f"📌 {row['brand'].title()} {row['model']} Details:\n"
                    + format_car_details(row.to_dict())
                )

    # 4. General filtering based on criteria
    filtered_df = df.copy()

    # Filter by brand
    brands = find_brand_mentions(query)
    if brands:
        filtered_df = filtered_df[filtered_df["brand"].isin(brands)]

    # Filter by model (if specific models are requested alongside other filters)
    models = find_model_mentions(query)
    if models:
        filtered_df = filtered_df[filtered_df["model"].str.lower().isin(models)]

    # Filter by price: the cheapest variant must meet the lower bound and the
    # most expensive variant must stay within the upper bound.
    min_price, max_price = extract_price_range(query)
    if min_price is not None:
        filtered_df = filtered_df[filtered_df["price_lakh_min"] >= min_price]
    if max_price is not None:
        filtered_df = filtered_df[filtered_df["price_lakh_max"] <= max_price]

    # Filter by mileage/range (same min/max column convention as price)
    min_mileage, max_mileage = extract_mileage_range(query)
    if min_mileage is not None:
        filtered_df = filtered_df[filtered_df["mileage_kmpl_min"] >= min_mileage]
    if max_mileage is not None:
        filtered_df = filtered_df[filtered_df["mileage_kmpl_max"] <= max_mileage]

    # Filter by fuel type
    fuel_types_in_query = []
    if 'petrol' in query_lower:
        fuel_types_in_query.append('petrol')
    if 'diesel' in query_lower:
        fuel_types_in_query.append('diesel')
    # "ev" must be matched as a whole word; a plain substring test also fires
    # on words such as "seven", "every" or "chevrolet".
    if 'electric' in query_lower or re.search(r'\bevs?\b', query_lower):
        fuel_types_in_query.append('electric')
    if 'cng' in query_lower:
        fuel_types_in_query.append('cng')
    # NOTE(review): isin() is an exact match, so this only selects rows whose
    # fuel_type is exactly 'hybrid' - confirm how strong/mild hybrids are stored.
    if 'hybrid' in query_lower:
        fuel_types_in_query.append('hybrid')

    if fuel_types_in_query:
        filtered_df = filtered_df[filtered_df['fuel_type'].isin(fuel_types_in_query)]

    # Filter by transmission
    transmission_types_in_query = []
    if 'auto' in query_lower:  # also covers "automatic"
        transmission_types_in_query.append('automatic')
    if 'manual' in query_lower:
        transmission_types_in_query.append('manual')

    if transmission_types_in_query:
        filtered_df = filtered_df[filtered_df['transmission'].isin(transmission_types_in_query)]

    # Filter by seating capacity (exact match). Accepts "7 seater", "7-seater",
    # "7 seats" and "7 seat".
    seating_match = re.search(r'(\d+)[\s-]*(?:seater|seats?)\b', query_lower)
    if seating_match:
        seats = int(seating_match.group(1))
        filtered_df = filtered_df[filtered_df['seating_capacity'] == seats]

    # Filter by body type
    body_types_in_query = [
        body_type
        for body_type in ('suv', 'sedan', 'hatchback', 'muv', 'pickup')
        if body_type in query_lower
    ]
    if body_types_in_query:
        filtered_df = filtered_df[filtered_df['body_type'].isin(body_types_in_query)]

    # Filter by general features (from the 'features' column); every requested
    # keyword must be present.
    general_features_from_query = extract_features_from_query(query)
    for feature_keyword in general_features_from_query:
        filtered_df = filtered_df[filtered_df["features"].str.contains(feature_keyword, na=False)]

    # Final response generation
    if filtered_df.empty:
        return "❌ No matching cars found for your query. Please try adjusting your requirements or asking a broader question."

    response = ""

    # Sort results based on query intent
    if any(word in query_lower for word in ['cheap', 'affordable', 'budget', 'lowest price']):
        filtered_df = filtered_df.sort_values('price_lakh_min')
    elif any(word in query_lower for word in ['expensive', 'premium', 'luxury', 'highest price']):
        filtered_df = filtered_df.sort_values('price_lakh_min', ascending=False)
    elif any(word in query_lower for word in ['mileage', 'fuel efficient', 'economy', 'best mileage']):
        filtered_df = filtered_df.sort_values('mileage_kmpl_min', ascending=False)
    else:  # Default sort
        filtered_df = filtered_df.sort_values('price_lakh_min')

    total_matches = len(filtered_df)

    # A bare brand query that matches many cars gets a compact list. The
    # filter lists themselves are checked (not the query text again) so that
    # a filter triggered by an alias such as "ev" still counts as a filter.
    intent_words = ['cheap', 'expensive', 'best', 'compare', 'vs', 'average', 'count', 'how many']
    is_simple_brand_query = (
        len(brands) == 1 and total_matches > 5 and
        not models and not min_price and not max_price and
        not min_mileage and not max_mileage and not general_features_from_query and
        not fuel_types_in_query and
        not transmission_types_in_query and
        not seating_match and not body_types_in_query and
        not any(word in query_lower for word in intent_words)
    )

    if is_simple_brand_query:
        response += f"🏷️ Found {total_matches} {brands[0].title()} models. Here's a summary:\n\n"

        for _, row in filtered_df.iterrows():
            entry = format_car_details(row.to_dict(), show_features=False, compact=True) + "\n"
            if len(response + entry) > MAX_TOTAL_CHARACTERS * 0.9:
                response += "\n... (further results truncated for length)\n"
                break
            response += entry

        # Aggregate summary for the brand
        avg_price = filtered_df['price_lakh_min'].mean()
        avg_mileage = filtered_df['mileage_kmpl_min'].mean()
        price_range = f"₹{filtered_df['price_lakh_min'].min():.2f}-{filtered_df['price_lakh_max'].max():.2f} Lakh"
        response += f"\n📊 Summary for {brands[0].title()} cars: Avg. Min Price: ₹{avg_price:.2f}L | Avg. Min Mileage/Range: {avg_mileage:.2f} | Price Range: {price_range}"

    else:
        # More specific filters: show detailed cards, up to a limit
        if total_matches > 1:
            response += f"Found {total_matches} matching cars:\n\n"
        else:
            response += "Found 1 matching car:\n\n"

        # Limit detailed results to keep the response manageable
        max_detailed_cars = 8 if total_matches <= 10 else 5

        shown = 0
        truncated = False
        for _, row in filtered_df.head(max_detailed_cars).iterrows():
            entry = format_car_details(row.to_dict(), show_features=True) + "\n"
            if len(response + entry) > MAX_TOTAL_CHARACTERS:
                truncated = True
                break
            response += entry
            shown += 1

        # Report the cars that were not shown exactly once, counting from the
        # number actually rendered rather than from the display limit.
        remaining = total_matches - shown
        if remaining > 0:
            if truncated:
                response += f"\n... and {remaining} more cars match your criteria. Please refine your search."
            else:
                response += f"\n... and {remaining} more cars match your criteria."

    return response.strip()
583
 
584
  # Enhanced Gradio interface
 
590
  "Best mileage car under 10 lakhs",
591
  "Mahindra cars with price and mileage",
592
  "Cars between 5 and 15 lakhs",
 
593
  "Show me all Honda models",
594
+ "Average price of cars in database",
595
+ "Electric cars with ADAS",
596
+ "SUVs with 7 seats and good mileage",
597
+ "Diesel cars from Toyota",
598
+ "Tell me about the Skoda Slavia"
599
  ]
600
 
601
  gr.Interface(
 
610
  label="Car Information"
611
  ),
612
  title="🚘 Enhanced Indian Car AI Assistant",
613
+ description="Ask me anything about Indian cars! I can help with comparisons, recommendations, specifications, and more. Data based on 'indian_car_info.csv'.",
614
  examples=examples,
615
  theme="soft"
616
  ).launch()