import json
import os

import pandas as pd


def _read_with_pandas(file_path):
    """Parse *file_path* with ``pd.read_json``; return ``None`` on failure.

    The default layout (e.g. an array of objects) is tried first, then
    line-delimited JSON. Only ``ValueError`` counts as "wrong layout";
    any other exception propagates to the caller.
    """
    for read_kwargs in ({}, {"lines": True}):
        try:
            return pd.read_json(file_path, **read_kwargs)
        except ValueError:
            continue
    return None


def _read_single_object(file_path):
    """Load a file holding one JSON object and return it as a one-row frame.

    Raises:
        ValueError: If the top-level JSON value is not an object
            (``json.JSONDecodeError`` is itself a ``ValueError``).
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        data = json.load(handle)
    if not isinstance(data, dict):
        raise ValueError("Not a dict")
    # Wrap the dict in a list so it becomes a single row, not an index.
    return pd.DataFrame([data])


def _as_text(series):
    """Return *series* as strings with missing values rendered as ``""``."""
    # Blank out missing values *before* casting: ``astype(str)`` would
    # otherwise turn NaN into the literal string "nan".
    return series.astype(object).where(series.notna(), "").astype(str)


def load_all_json(folder_path="Data", limit=10):
    """Load every ``*.json`` file in a folder into one DataFrame.

    Each file is parsed, in order of preference, as a layout
    ``pd.read_json`` understands, as line-delimited JSON, or as a single
    JSON object. Files that cannot be parsed by any of these are reported
    on stdout and skipped.

    The result always has a ``text`` column and a ``title`` column:

    * If both ``title`` and ``description`` columns exist, ``text`` is
      ``"<title>: <description>: <url>"``; missing values (and a missing
      ``url`` column) contribute an empty string.
    * Otherwise ``text`` is all columns cast to ``str`` and joined with
      spaces, and ``title`` falls back to the first 50 characters of
      ``text`` when no ``title`` column exists.

    Args:
        folder_path: Directory to scan (not recursive).
        limit: Maximum number of rows to return, passed to
            ``DataFrame.head``. A falsy value (``None`` or ``0``)
            disables the limit.

    Returns:
        The concatenated DataFrame with a fresh integer index.

    Raises:
        ValueError: If no file in *folder_path* could be loaded.
        OSError: If *folder_path* cannot be listed.
    """
    frames = []
    # ``os.listdir`` order is arbitrary; sort so that the rows kept by
    # ``limit`` are the same on every run and platform.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, file_name)

        frame = _read_with_pandas(file_path)
        if frame is None:
            try:
                frame = _read_single_object(file_path)
            except Exception as e:  # best effort: report and move on
                print(f"Skipping {file_name}: {e}")
                continue
        frames.append(frame)

    if not frames:
        raise ValueError(f"No valid JSON files found in {folder_path}")

    df = pd.concat(frames, ignore_index=True)

    # Create text field
    if "title" in df.columns and "description" in df.columns:
        # ``url`` is optional; fall back to an empty suffix when absent.
        url_text = _as_text(df["url"]) if "url" in df.columns else ""
        df["text"] = (
            _as_text(df["title"])
            + ": "
            + _as_text(df["description"])
            + ": "
            + url_text
        )
    else:
        df["text"] = df.astype(str).agg(" ".join, axis=1)

    # Ensure title field exists
    if "title" not in df.columns:
        df["title"] = df["text"].str[:50]  # fallback: first 50 chars

    df = df.dropna(subset=["text"])

    # Limit number of rows
    if limit:
        df = df.head(limit)

    return df