Spaces:
Sleeping
Sleeping
| import os | |
| import pandas as pd | |
| import json | |
def _read_json_frame(file_path):
    """Parse one JSON file into a DataFrame, trying three layouts in turn.

    Raises whatever the last attempted parser raised when no layout fits.
    """
    try:
        # Layout 1: whatever pd.read_json accepts directly (e.g. array of objects).
        return pd.read_json(file_path)
    except ValueError:
        pass
    try:
        # Layout 2: line-delimited JSON, one object per line.
        return pd.read_json(file_path, lines=True)
    except ValueError:
        pass
    # Layout 3: a single JSON object, wrapped in a list so it becomes one row.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        raise ValueError("Not a dict")
    return pd.DataFrame([data])


def _as_text(series):
    """Return *series* as strings, with missing values rendered as ""."""
    # Blank out missing values BEFORE casting: astype(str) alone would turn
    # NaN/None into the literal strings "nan"/"None".
    return series.astype(object).where(series.notna(), "").astype(str)


def load_all_json(folder_path="Data", limit=10):
    """Load every ``*.json`` file in a folder into one DataFrame with a ``text`` column.

    Files are visited in sorted name order so the rows kept by ``limit`` are
    reproducible. Each file is parsed with ``pd.read_json``, then as
    line-delimited JSON, then as a single JSON object; files that cannot be
    read or parsed are reported on stdout and skipped.

    The ``text`` column is ``"<title>: <description>: <url>"`` when both
    ``title`` and ``description`` columns exist (a missing ``url`` column or
    value contributes an empty string); otherwise it is every column of the
    row cast to ``str`` and joined with single spaces. When there is no
    ``title`` column, one is created from the first 50 characters of ``text``.

    Args:
        folder_path: Directory to scan (not recursive) for ``.json`` files.
        limit: Maximum number of rows to return, passed to ``DataFrame.head``.
            A falsy value (``None`` or ``0``) returns all rows.

    Returns:
        The concatenated DataFrame, including ``text`` and ``title`` columns.

    Raises:
        ValueError: If no file in ``folder_path`` could be loaded.
        OSError: If ``folder_path`` cannot be listed (e.g. it does not exist).
    """
    frames = []
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, file)
        try:
            frames.append(_read_json_frame(file_path))
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the load.
            print(f"Skipping {file}: {e}")

    if not frames:
        raise ValueError(f"No valid JSON files found in {folder_path}")

    df = pd.concat(frames, ignore_index=True)

    # Create text field
    if "title" in df.columns and "description" in df.columns:
        # "url" is optional; a plain "" keeps the trailing ": " separator so
        # the text layout is the same whether or not the column exists.
        url_text = _as_text(df["url"]) if "url" in df.columns else ""
        df["text"] = (
            _as_text(df["title"])
            + ": "
            + _as_text(df["description"])
            + ": "
            + url_text
        )
    else:
        df["text"] = df.astype(str).agg(" ".join, axis=1)

    # Ensure title field exists
    if "title" not in df.columns:
        df["title"] = df["text"].str[:50]  # fallback: first 50 chars

    df = df.dropna(subset=["text"])

    # Limit number of rows
    if limit:
        df = df.head(limit)
    return df