import os
import pandas as pd
import json

def _read_json_file(file_path: str) -> pd.DataFrame:
    """Parse one JSON file into a DataFrame, trying three layouts in turn.

    The layouts are attempted in this order:

    1. whatever ``pd.read_json`` accepts by default (e.g. an array of objects),
    2. line-delimited JSON (``lines=True``),
    3. a single JSON object, which becomes a one-row frame.

    Raises:
        ValueError: if none of the layouts match (including when the file
            holds valid JSON whose top level is not an object).
        OSError: if the file cannot be opened for the final attempt.
    """
    try:
        return pd.read_json(file_path)
    except ValueError:
        pass

    try:
        return pd.read_json(file_path, lines=True)
    except ValueError:
        pass

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        raise ValueError("Not a dict")
    return pd.DataFrame([data])  # wrap the dict in a list -> one row


def _as_text(series: pd.Series) -> pd.Series:
    """Return *series* as strings, rendering missing values as ``""``."""
    # Mask using the ORIGINAL series' missing-value positions: after
    # ``astype(str)`` a missing value may already have become the text "nan",
    # at which point ``fillna`` could no longer find it.
    return series.astype(str).mask(series.isna(), "")


def load_all_json(folder_path="Data", limit=10) -> pd.DataFrame:
    """Load every ``*.json`` file in a folder into one DataFrame.

    Each file is parsed with ``_read_json_file``; files that cannot be parsed
    are reported with a ``Skipping ...`` message on stdout and ignored. The
    per-file frames are concatenated with a fresh index, and a ``text``
    column is derived:

    * if both ``title`` and ``description`` columns exist, ``text`` is
      ``"<title>: <description>: <url>"`` (missing values, or a missing
      ``url`` column, contribute an empty string);
    * otherwise ``text`` is every column of the row, stringified and joined
      with single spaces.

    If there is no ``title`` column, one is created from the first 50
    characters of ``text``.

    Args:
        folder_path: Directory to scan (not recursive).
        limit: Maximum number of rows to return. Any falsy value (``None``,
            ``0``) disables the limit.

    Returns:
        The combined DataFrame, including the ``text`` and ``title`` columns.

    Raises:
        ValueError: if no file in ``folder_path`` could be loaded.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sort so the row order
    # (and therefore which rows survive ``limit``) is reproducible.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, file)
        if not os.path.isfile(file_path):
            # A directory that merely happens to be named "*.json".
            continue
        try:
            frames.append(_read_json_file(file_path))
        except Exception as e:
            # Deliberately best-effort: one bad file must not abort the load.
            print(f"Skipping {file}: {e}")

    if not frames:
        raise ValueError(f"No valid JSON files found in {folder_path}")

    df = pd.concat(frames, ignore_index=True)

    # Create text field
    if "title" in df.columns and "description" in df.columns:
        # The url column is optional; without it the url part is empty.
        url_text = _as_text(df["url"]) if "url" in df.columns else ""
        df["text"] = (
            _as_text(df["title"])
            + ": "
            + _as_text(df["description"])
            + ": "
            + url_text
        )
    else:
        df["text"] = df.astype(str).agg(" ".join, axis=1)

    # Ensure title field exists
    if "title" not in df.columns:
        df["title"] = df["text"].str[:50]  # fallback: first 50 chars

    df = df.dropna(subset=["text"])

    # Limit number of rows
    if limit:
        df = df.head(limit)

    return df