Carz / src /convert_data.py
umer6016
Fresh Deploy
1d12e97
import pandas as pd
from rdflib import Graph, Namespace, RDF, RDFS, OWL, Literal, XSD, URIRef
import re
import os
# Base namespace for every URI minted by this script: car, manufacturer,
# fuel and engine individuals as well as the classes and properties used.
EX = Namespace("http://example.org/cars/")
def clean_price(value):
    """Parse a price cell into a float.

    Dollar signs and thousands separators are removed, then the first
    numeric token in the remaining text is returned, so a range such as
    "$12,000-$15,000" yields its lower bound.

    Args:
        value: Raw cell value; may be a string, a number or a missing value.

    Returns:
        The parsed price as a float, or 0.0 when the cell is missing or
        contains no parseable number.
    """
    if pd.isna(value):
        return 0.0
    val_str = str(value).replace('$', '').replace(',', '').strip()
    # A run of digits and dots is not necessarily a number (e.g. the "." left
    # over from "Approx." or a token like "1.2.3"). Skip such tokens rather
    # than letting float() raise and abort the whole conversion.
    for token in re.findall(r'[\d\.]+', val_str):
        try:
            return float(token)
        except ValueError:
            continue
    return 0.0
def clean_number(value):
    """Extract the first number from a free-text cell (e.g. "1,200 hp").

    Commas are treated as thousands separators and removed before parsing.
    For a range such as "70-85 hp" the first value is returned.

    Args:
        value: Raw cell value; may be a string, a number or a missing value.

    Returns:
        The first parseable number as a float, or 0 when the cell is missing
        or contains no parseable number.
    """
    if pd.isna(value):
        return 0
    # Stray punctuation such as a lone "," or "." matches the character class
    # but is not a number; skip it and try the next token instead of letting
    # float() raise.
    for token in re.findall(r'[\d\.,]+', str(value)):
        try:
            return float(token.replace(',', ''))
        except ValueError:
            continue
    return 0
def clean_seats(value):
    """Return the seat count found in a cell as an int.

    The first run of digits in the text is used (so "2+2" gives 2). When the
    cell is missing or holds no digits, the fallback of 2 seats is returned.
    """
    fallback = 2
    if pd.isna(value):
        return fallback
    found = re.search(r'(\d+)', str(value))
    if found is None:
        return fallback
    return int(found.group(1))
# Substrings probed, in this order, to classify the engine column. The first
# hit wins and its upper-cased form is the canonical engine key.
_ENGINE_MARKERS = ("v8", "v12", "v10", "v6", "w12", "w16")


def _is_blank(value):
    """Return True when a CSV cell is missing or contains only whitespace."""
    return pd.isna(value) or not str(value).strip()


def _classify_fuel(fuel_raw):
    """Map a raw fuel description to a canonical fuel key (default PETROL)."""
    text = fuel_raw.lower()
    if "diesel" in text:
        return "DIESEL"
    # "electric" alone means a pure EV; combined with "hybrid" it falls
    # through to the hybrid rules below.
    if "electric" in text and "hybrid" not in text:
        return "ELECTRIC"
    if "plug" in text:
        return "PLUG-IN HYBRID"
    if "hybrid" in text:
        return "HYBRID"
    if "hydrogen" in text:
        return "HYDROGEN"
    if "cng" in text:
        return "CNG"
    return "PETROL"


def _classify_engine(engine_raw):
    """Map a raw engine description to a canonical key (default "Engine")."""
    text = engine_raw.lower()
    for marker in _ENGINE_MARKERS:
        if marker in text:
            return marker.upper()
    return "Engine"


def convert_data():
    """Build the cars knowledge graph from the CSV dataset and save it as Turtle.

    Steps:
      1. Create a graph with the ``ex``, ``owl`` and ``rdfs`` prefixes bound
         and, if ``cars_ontology.ttl`` exists in the working directory, parse
         it into the graph first.
      2. Read ``Cars Datasets 2025.csv`` (looked up in the parent directory,
         then in the working directory) with latin1 encoding.
      3. For every row, add the car, its manufacturer, fuel type and engine
         as individuals, link known ones to DBpedia with ``owl:sameAs``, and
         attach price, seat count, top speed and horsepower literals.
      4. Serialize the graph to ``cars_knowledge_graph.ttl``.

    Rows without a car name are skipped; rows without a company name get no
    manufacturer triples. If the CSV cannot be read, the error is printed and
    the function returns without writing any output.

    Returns:
        None.
    """
    # Load or Create Graph
    g = Graph()
    g.bind("ex", EX)
    g.bind("owl", OWL)
    g.bind("rdfs", RDFS)

    # Load Ontology T-Box (if exists, to keep definitions)
    if os.path.exists("cars_ontology.ttl"):
        g.parse("cars_ontology.ttl", format="turtle")

    # DBpedia Mappings (keys are upper-cased company names)
    dbpedia_manufacturers = {
        "FERRARI": "http://dbpedia.org/resource/Ferrari",
        "ROLLS ROYCE": "http://dbpedia.org/resource/Rolls-Royce_Motor_Cars",
        "FORD": "http://dbpedia.org/resource/Ford_Motor_Company",
        "MERCEDES": "http://dbpedia.org/resource/Mercedes-Benz",
        "AUDI": "http://dbpedia.org/resource/Audi",
        "BMW": "http://dbpedia.org/resource/BMW",
        "ASTON MARTIN": "http://dbpedia.org/resource/Aston_Martin",
        "BENTLEY": "http://dbpedia.org/resource/Bentley",
        "LAMBORGHINI": "http://dbpedia.org/resource/Lamborghini",
        "TOYOTA": "http://dbpedia.org/resource/Toyota",
        "NISSAN": "http://dbpedia.org/resource/Nissan",
        "VOLVO": "http://dbpedia.org/resource/Volvo_Cars",
        "KIA": "http://dbpedia.org/resource/Kia",
        "HONDA": "http://dbpedia.org/resource/Honda",
        "HYUNDAI": "http://dbpedia.org/resource/Hyundai_Motor_Company",
        "MAHINDRA": "http://dbpedia.org/resource/Mahindra_&_Mahindra",
        "MARUTI SUZUKI": "http://dbpedia.org/resource/Maruti_Suzuki",
        "VOLKSWAGEN": "http://dbpedia.org/resource/Volkswagen",
        "PORSCHE": "http://dbpedia.org/resource/Porsche",
        "CADILLAC": "http://dbpedia.org/resource/Cadillac",
        "TATA MOTORS": "http://dbpedia.org/resource/Tata_Motors",
        "TESLA": "http://dbpedia.org/resource/Tesla,_Inc.",
        "JEEP": "http://dbpedia.org/resource/Jeep",
        "MAZDA": "http://dbpedia.org/resource/Mazda",
        "CHEVROLET": "http://dbpedia.org/resource/Chevrolet",
        "GMC": "http://dbpedia.org/resource/GMC_(automobile)",
        "PEUGEOT": "http://dbpedia.org/resource/Peugeot",
        "BUGATTI": "http://dbpedia.org/resource/Bugatti_Automobiles",
        "JAGUAR LAND ROVER": "http://dbpedia.org/resource/Jaguar_Land_Rover",
        "ACURA": "http://dbpedia.org/resource/Acura",
        "MITSUBISHI": "http://dbpedia.org/resource/Mitsubishi_Motors",
    }

    # NOTE: only "Coupe", "Sedan" and "SuperCar" are looked up below; the
    # "SUV" and "Car" entries are currently unused.
    dbpedia_body = {
        "Coupe": "http://dbpedia.org/resource/Coupe",
        "Sedan": "http://dbpedia.org/resource/Sedan_(automobile)",
        "SUV": "http://dbpedia.org/resource/Sport_utility_vehicle",
        "SuperCar": "http://dbpedia.org/resource/Supercar",
        "Car": "http://dbpedia.org/resource/Car",
    }

    # Fuel Mappings
    dbpedia_fuels = {
        "PETROL": "http://dbpedia.org/resource/Gasoline",
        "DIESEL": "http://dbpedia.org/resource/Diesel_fuel",
        "ELECTRIC": "http://dbpedia.org/resource/Electric_vehicle",  # Linking to EV concept for fuel type context
        "HYBRID": "http://dbpedia.org/resource/Hybrid_vehicle",
        "PLUG-IN HYBRID": "http://dbpedia.org/resource/Plug-in_hybrid",
        "HYDROGEN": "http://dbpedia.org/resource/Hydrogen_fuel",
        "CNG": "http://dbpedia.org/resource/Compressed_natural_gas",
    }

    # Engine Mappings (Common types)
    # NOTE: the engine classification never yields "I4" or "ELECTRIC", so
    # those two entries are currently not linked.
    dbpedia_engines = {
        "V8": "http://dbpedia.org/resource/V8_engine",
        "V10": "http://dbpedia.org/resource/V10_engine",
        "V12": "http://dbpedia.org/resource/V12_engine",
        "V6": "http://dbpedia.org/resource/V6_engine",
        "W12": "http://dbpedia.org/resource/W12_engine",
        "W16": "http://dbpedia.org/resource/W16_engine",
        "I4": "http://dbpedia.org/resource/Inline-four_engine",
        "ELECTRIC": "http://dbpedia.org/resource/Electric_motor",
    }

    # Load CSV: prefer the copy in the parent directory, fall back to the
    # working directory.
    csv_path = "../Cars Datasets 2025.csv"
    if not os.path.exists(csv_path):
        csv_path = "Cars Datasets 2025.csv"

    try:
        df = pd.read_csv(csv_path, encoding='latin1')
    except Exception as e:
        # Best-effort script: report the problem and stop without output.
        print(f"Error reading CSV: {e}")
        return

    print(f"Processing {len(df)} rows...")

    skipped = 0
    for _, row in df.iterrows():
        # A row without a car name cannot be given a meaningful URI; str()
        # on a missing cell would otherwise create a car called "nan".
        if _is_blank(row['Cars Names']):
            skipped += 1
            continue

        car_name = str(row['Cars Names']).strip()
        car_uri = EX[car_name.replace(" ", "_").replace("/", "-").replace("(", "").replace(")", "")]
        g.add((car_uri, RDF.type, EX.Car))
        g.add((car_uri, RDFS.label, Literal(car_name, datatype=XSD.string)))

        # Manufacturer (+ DBpedia interlinking); omitted when the company
        # cell is missing so no "NAN" manufacturer is invented.
        if not _is_blank(row['Company Names']):
            comp_name_raw = str(row['Company Names']).strip()
            comp_name_upper = comp_name_raw.upper()
            # Normalize Company Name for URI
            comp_uri = EX[comp_name_upper.replace(" ", "_")]
            g.add((comp_uri, RDF.type, EX.Manufacturer))
            g.add((comp_uri, RDFS.label, Literal(comp_name_raw, datatype=XSD.string)))
            g.add((car_uri, EX.hasManufacturer, comp_uri))
            manufacturer_link = dbpedia_manufacturers.get(comp_name_upper)
            if manufacturer_link is not None:
                g.add((comp_uri, OWL.sameAs, URIRef(manufacturer_link)))

        # Fuel type
        fuel_clean = _classify_fuel(str(row['Fuel Types']).strip())
        fuel_uri = EX[fuel_clean.replace(" ", "_").replace("-", "_")]
        g.add((fuel_uri, RDF.type, EX.FuelType))
        g.add((fuel_uri, RDFS.label, Literal(fuel_clean, datatype=XSD.string)))
        g.add((car_uri, EX.usesFuel, fuel_uri))
        if fuel_clean in dbpedia_fuels:
            g.add((fuel_uri, OWL.sameAs, URIRef(dbpedia_fuels[fuel_clean])))

        # Engine
        engine_clean = _classify_engine(str(row['Engines']).strip())
        engine_uri = EX[engine_clean.replace(" ", "_")]
        g.add((engine_uri, RDF.type, EX.Engine))
        g.add((car_uri, EX.hasEngine, engine_uri))
        if engine_clean in dbpedia_engines:
            g.add((engine_uri, OWL.sameAs, URIRef(dbpedia_engines[engine_clean])))

        # Determine Car Subclass & Interlinking
        seats = clean_seats(row['Seats'])
        price = clean_price(row['Cars Prices'])
        top_speed = clean_number(row['Total Speed'])
        hp = clean_number(row['HorsePower'])

        # Two seats -> Coupe, four or more -> Sedan; other seat counts get
        # no body class beyond ex:Car.
        if seats == 2:
            g.add((car_uri, RDF.type, EX.Coupe))
            g.add((EX.Coupe, OWL.sameAs, URIRef(dbpedia_body["Coupe"])))  # Class Level link (optional but good)
        elif seats >= 4:
            g.add((car_uri, RDF.type, EX.Sedan))
            g.add((EX.Sedan, OWL.sameAs, URIRef(dbpedia_body["Sedan"])))

        # SuperCar is additive: any car with a top speed above 300.
        if top_speed > 300:
            g.add((car_uri, RDF.type, EX.SuperCar))
            g.add((EX.SuperCar, OWL.sameAs, URIRef(dbpedia_body["SuperCar"])))

        # Add Properties
        g.add((car_uri, EX.hasPriceValue, Literal(price, datatype=XSD.float)))
        g.add((car_uri, EX.hasSeatCount, Literal(seats, datatype=XSD.integer)))
        g.add((car_uri, EX.hasTopSpeedKMH, Literal(int(top_speed), datatype=XSD.integer)))
        g.add((car_uri, EX.hasHorsePowerValue, Literal(int(hp), datatype=XSD.integer)))

    if skipped:
        print(f"Skipped {skipped} rows without a car name.")

    # Save Graph
    g.serialize(destination="cars_knowledge_graph.ttl", format="turtle")
    print(f"Knowledge Graph saved to cars_knowledge_graph.ttl with {len(g)} triples.")
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    convert_data()