maiseumsChat / dataset_loader.py
NitinMoturu's picture
Create dataset_loader.py
aab9b18 verified
import os
import pandas as pd
import json
def _read_tabular_json(file_path):
    """Parse *file_path* as an array of objects, then as line-delimited JSON.

    Returns the parsed DataFrame, or ``None`` when pandas rejects both
    layouts with a ``ValueError``. Any other exception propagates.
    """
    for read_kwargs in ({}, {"lines": True}):
        try:
            return pd.read_json(file_path, **read_kwargs)
        except ValueError:
            continue
    return None


def _read_single_object(file_path):
    """Load a file holding one JSON object and return it as a one-row frame.

    Raises:
        ValueError: if the top-level JSON value is not an object (this also
            covers ``json.JSONDecodeError``, which subclasses ``ValueError``).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        raise ValueError("Not a dict")
    return pd.DataFrame([data])  # wrap dict in a list -> exactly one row


def _as_text(series):
    """Return *series* as strings with missing values rendered as ``""``.

    Nulls are filled *before* the cast; casting first would turn them into
    the literal strings ``"nan"`` / ``"None"``.
    """
    return series.fillna("").astype(str)


def load_all_json(folder_path="Data", limit=10):
    """Load every ``*.json`` file in a folder into one DataFrame.

    Each file is tried as an array of objects, then as line-delimited JSON,
    then as a single JSON object. Files that fit none of these are reported
    on stdout and skipped. The frames are concatenated and a ``text`` column
    is added:

    * if both ``title`` and ``description`` columns exist, ``text`` is
      ``"<title>: <description>: <url>"`` (``url`` is empty when the column
      is absent or the value is missing);
    * otherwise ``text`` is every column of the row, stringified and joined
      with single spaces, and ``title`` (if absent) becomes the first 50
      characters of ``text``.

    Args:
        folder_path: Directory to scan (not recursive).
        limit: Maximum number of rows to return. Any falsy value
            (``None``, ``0``) disables the limit.

    Returns:
        The combined DataFrame with ``text`` and ``title`` columns.

    Raises:
        ValueError: if no file in *folder_path* could be loaded.
    """
    all_dfs = []

    # os.listdir order is arbitrary; sorting makes the rows kept by `limit`
    # reproducible across runs and platforms.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, file)

        df = _read_tabular_json(file_path)
        if df is None:
            try:
                df = _read_single_object(file_path)
            except Exception as e:
                # Deliberate best-effort: one bad file must not abort the load.
                print(f"Skipping {file}: {e}")
                continue
        all_dfs.append(df)

    if not all_dfs:
        raise ValueError(f"No valid JSON files found in {folder_path}")

    df = pd.concat(all_dfs, ignore_index=True)

    # Create text field
    if "title" in df.columns and "description" in df.columns:
        # `url` is optional: fall back to an empty suffix instead of calling
        # Series methods on a plain-string default.
        url_part = _as_text(df["url"]) if "url" in df.columns else ""
        df["text"] = (
            _as_text(df["title"])
            + ": "
            + _as_text(df["description"])
            + ": "
            + url_part
        )
    else:
        df["text"] = df.astype(str).agg(" ".join, axis=1)

    # Ensure title field exists
    if "title" not in df.columns:
        df["title"] = df["text"].str[:50]  # fallback: first 50 chars

    df = df.dropna(subset=["text"])

    # Limit number of rows
    if limit:
        df = df.head(limit)

    return df