# data-gen/seed/csv_cleaner.py
# (Hugging Face page header captured with the file: "ashish-sarvam's picture",
# "Upload folder using huggingface_hub", "fc1a684 verified")
import json
import pandas as pd
import re
def parse_company_csv(path, location_parts=3):
    """Parse a two-rows-per-company seed file into structured records.

    The file is read with pandas as a single ``raw`` column. After empty rows
    are dropped, the remaining rows are consumed in pairs: a company-name row
    followed by a details row laid out as::

        <tags>, <location ending in "India"><description>.<rank>

    Args:
        path: Location of the seed file; passed straight to
            ``pandas.read_csv``.
        location_parts: How many trailing comma-separated fields (counting
            the one that ends in "India") form the location. Everything
            before them is treated as tags. The default of 3 assumes a
            "city, state, country" location. Must be at least 1.

    Returns:
        A list of dicts with the keys ``name``, ``tags`` (list of str),
        ``location``, ``description`` and ``rank``. ``rank`` is an int, or
        ``None`` when the text after the last period is not an integer.
        Pairs whose details row does not contain "India" are skipped.

    Raises:
        ValueError: If ``location_parts`` is smaller than 1.
    """
    if location_parts < 1:
        raise ValueError("location_parts must be at least 1")

    # read raw file (it may not be a valid csv, so read as single column)
    df = pd.read_csv(path, header=None, names=["raw"], dtype=str)
    # drop empty rows (missing cells and whitespace-only cells)
    df = df.dropna()
    rows = [cell.strip() for cell in df["raw"]]
    rows = [cell for cell in rows if cell]

    records = []
    # even positions hold names, odd positions hold the matching details
    for name, details in zip(rows[::2], rows[1::2]):
        # find location: assume it ends at the first "India" in the row
        head, marker, after_loc = details.partition("India")
        if not marker:
            continue

        # split tags vs location: the location is the last `location_parts`
        # comma-separated fields of the text up to and including "India"
        up_to_location = head + marker
        fields = [f.strip() for f in up_to_location.split(",") if f.strip()]
        if len(fields) >= location_parts:
            tags = fields[:-location_parts]
            location = ", ".join(fields[-location_parts:])
        else:
            # too few fields to separate tags; keep the whole text as location
            tags = []
            location = up_to_location.strip()

        # after the location come description + rank;
        # the last period marks the end of the description
        last_dot = after_loc.rfind(".")
        description = after_loc[: last_dot + 1].strip()
        rank_str = after_loc[last_dot + 1 :].strip()
        try:
            rank = int(rank_str)
        except ValueError:
            rank = None

        records.append(
            {
                "name": name,
                "tags": tags,
                "location": location,
                "description": description,
                "rank": rank,
            }
        )

    if len(rows) % 2:
        # an odd row count leaves a final name row without a details row
        print(f"Error parsing record {len(rows) - 1}: missing details row")

    return records
# Example usage: parse the seed file, then export the structured records
# both as a CSV table and as an indented JSON document.
records = parse_company_csv("company_seeds.csv")
pd.DataFrame(records).to_csv("companies_structured.csv", index=False)
with open("companies_structured.json", "w") as json_out:
    json_out.write(json.dumps(records, indent=2))
# import pandas as pd
# import re
# def parse_company_csv(path):
# df = pd.read_csv(path, header=None, names=["name","skip","details","description","rank"], dtype=str)
# records = []
# for _, row in df.iterrows():
# try:
# name = row["name"].strip()
# details = str(row["details"]).strip()
# description = str(row["description"]).strip()
# rank_str = str(row["rank"]).strip().replace(".0","")
# rank = int(rank_str) if rank_str.isdigit() else None
# # split details into parts
# parts = [p.strip() for p in details.split(",") if p.strip()]
# # assume last 3 parts = city, state, country
# if len(parts) >= 3:
# location = ", ".join(parts[-3:])
# tags = parts[:-3]
# else:
# location = details
# tags = []
# record = {
# "name": name,
# "tags": tags,
# "location": location,
# "description": description,
# "rank": rank
# }
# records.append(record)
# except Exception as e:
# print(f"Error parsing {row}: {e}")
# continue
# return records
# # Example usage:
# # records = parse_company_csv("companies_raw.csv")
# # pd.DataFrame(records).to_csv("companies_structured.csv", index=False)