Spaces:
Runtime error
Runtime error
| import json | |
| import pandas as pd | |
| import re | |
# Number of trailing comma-separated parts that make up the location
# (city, state, country); every part before them is treated as a tag.
_LOCATION_PARTS = 3


def _split_tags_and_location(prefix):
    """Split the text up to and including "India" into ``(tags, location)``."""
    parts = [part.strip() for part in prefix.split(",") if part.strip()]
    if len(parts) < _LOCATION_PARTS:
        # Too few parts to separate tags from the location: keep it whole.
        return [], prefix.strip()
    return parts[:-_LOCATION_PARTS], ", ".join(parts[-_LOCATION_PARTS:])


def _split_description_and_rank(text):
    """Split the text following the location into ``(description, rank)``."""
    text = text.strip()
    # The last period marks the end of the description; an integer rank may
    # follow it.
    last_dot = text.rfind(".")
    try:
        rank = int(text[last_dot + 1:])
    except ValueError:
        # Nothing numeric after the last period (or no period at all): keep
        # the whole text as the description instead of discarding part of it.
        return text, None
    return text[: last_dot + 1].strip(), rank


def parse_company_csv(path):
    """Parse a two-lines-per-company raw file into structured records.

    The file is expected to alternate between a company-name line and a
    details line shaped like
    ``<tag>, ..., <city>, <state>, India <description>. <rank>``.
    Blank lines are ignored. A details line without the word "India" causes
    that company to be skipped, and a trailing name line without a details
    line is reported on stdout and ignored.

    Args:
        path: Path of the raw file to read.

    Returns:
        A list of dicts with the keys ``name`` (str), ``tags`` (list of str),
        ``location`` (str), ``description`` (str) and ``rank`` (int, or
        ``None`` when no integer follows the last period).
    """
    import csv  # local import: keeps this block independent of file imports

    # The details lines contain commas, so a comma-separated parse does not
    # yield "one column per line". Re-joining the parsed fields restores each
    # row as a single string while still unquoting rows that were quoted.
    with open(path, newline="", encoding="utf-8-sig") as handle:
        rows = (",".join(fields).strip() for fields in csv.reader(handle))
        lines = [row for row in rows if row]

    if len(lines) % 2:
        print(f"Error parsing record {len(lines) - 1}: missing details line")

    records = []
    # Even positions hold names, odd positions hold the matching details.
    for name, details in zip(lines[::2], lines[1::2]):
        # The location is assumed to end at the first occurrence of "India".
        prefix, marker, remainder = details.partition("India")
        if not marker:
            continue
        tags, location = _split_tags_and_location(prefix + marker)
        description, rank = _split_description_and_rank(remainder)
        records.append(
            {
                "name": name,
                "tags": tags,
                "location": location,
                "description": description,
                "rank": rank,
            }
        )
    return records
# Example usage: parse the seed file, then export the records as CSV and JSON.
records = parse_company_csv("company_seeds.csv")
structured = pd.DataFrame(records)
structured.to_csv("companies_structured.csv", index=False)
with open("companies_structured.json", "w") as json_file:
    json_file.write(json.dumps(records, indent=2))
| # import pandas as pd | |
| # import re | |
| # def parse_company_csv(path): | |
| # df = pd.read_csv(path, header=None, names=["name","skip","details","description","rank"], dtype=str) | |
| # records = [] | |
| # for _, row in df.iterrows(): | |
| # try: | |
| # name = row["name"].strip() | |
| # details = str(row["details"]).strip() | |
| # description = str(row["description"]).strip() | |
| # rank_str = str(row["rank"]).strip().replace(".0","") | |
| # rank = int(rank_str) if rank_str.isdigit() else None | |
| # # split details into parts | |
| # parts = [p.strip() for p in details.split(",") if p.strip()] | |
| # # assume last 3 parts = city, state, country | |
| # if len(parts) >= 3: | |
| # location = ", ".join(parts[-3:]) | |
| # tags = parts[:-3] | |
| # else: | |
| # location = details | |
| # tags = [] | |
| # record = { | |
| # "name": name, | |
| # "tags": tags, | |
| # "location": location, | |
| # "description": description, | |
| # "rank": rank | |
| # } | |
| # records.append(record) | |
| # except Exception as e: | |
| # print(f"Error parsing {row}: {e}") | |
| # continue | |
| # return records | |
| # # Example usage: | |
| # # records = parse_company_csv("companies_raw.csv") | |
| # # pd.DataFrame(records).to_csv("companies_structured.csv", index=False) | |