###############################################################################################################
###############################################################################################################
from huggingface_hub import HfApi
import os
import time

# Name of the environment variable that carries the Hugging Face token
# (the error message below describes it as a secret variable of this repo).
token_name = "rules_token"
token = os.environ.get(token_name)
if token:
    print("Token is not none!")
else:
    # Fail fast: nothing below can upload without a token.
    raise ValueError(f"Missing Hugging Face token. Set '{token_name}' as an environment variable[secret variable in this repo].")
# Module-level client authenticated with the token read above.
api = HfApi(token=token)
###############################################################################################################
###############################################################################################################


###############################################################################################################
###############################################################################################################
from utils import *
from datasets import load_dataset

# Choose the OPUS-100 language pair to process: keep exactly one of the two
# load_dataset lines below active (the trailing number is the author's row count).
dataset = load_dataset("opus100", "en-fr", split="train")  # 1000000
# dataset = load_dataset("opus100", "de-en", split="train")  # 1000000
dataset_size = len(dataset)
print(dataset_size)
###############################################################################################################
###############################################################################################################


###############################################################################################################
###############################################################################################################
# s_rules = load_syntactic_rules()
# print(len(s_rules))
# s_rules[('VP', ('VB', 'NP', 'ADVP'))]

# Start the rule table from the built-in syntactic rules.
PHRASE_TRANSFORMATION_RULES = {}
PHRASE_TRANSFORMATION_RULES.update(load_syntactic_rules())

# @title Retrieving saved rules for analyzing the next batches
import pandas as pd
import os

# Folder holding the per-batch rule CSVs produced by earlier runs.
rules_path = "./rules"

# Collect the batch CSVs in a deterministic (sorted) order. A missing folder is
# treated like an empty one so a first run, with nothing saved yet, still works.
if os.path.isdir(rules_path):
    csv_files = sorted(f for f in os.listdir(rules_path) if f.endswith('.csv'))
else:
    csv_files = []

# Read every batch file. A file with no header row (e.g. one saved for a batch
# that produced no rows) makes pd.read_csv raise EmptyDataError; skip its
# contents but keep its name in csv_files, because the name still records how
# far processing got.
frames = []
for f in csv_files:
    try:
        frames.append(pd.read_csv(os.path.join(rules_path, f)))
    except pd.errors.EmptyDataError:
        print(f"Skipping empty rules file: {f}")

# pd.concat rejects an empty list, so fall back to an empty frame.
merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Map the first column to the second column; later rows win on duplicate keys.
# NOTE(review): keys loaded here are whatever pandas parsed from the CSV
# (strings for text cells) - confirm they have the same type as the keys
# produced by load_syntactic_rules() / analyze_sentence3().
if merged_df.shape[1] >= 2:
    result_dict = dict(zip(merged_df.iloc[:, 0], merged_df.iloc[:, 1]))
else:
    result_dict = {}

print(f"len(PHRASE_TRANSFORMATION_RULES): {len(PHRASE_TRANSFORMATION_RULES)}")
PHRASE_TRANSFORMATION_RULES = PHRASE_TRANSFORMATION_RULES | result_dict
print(f"len(PHRASE_TRANSFORMATION_RULES): {len(PHRASE_TRANSFORMATION_RULES)}")
###############################################################################################################
###############################################################################################################


###############################################################################################################
###############################################################################################################
def compute_next_start_idx(files=None):
    """
    Return the dataset index at which processing should resume.

    Batch files are named ``OPUS100_<start>_<end>_rules_batch_<n>.csv``; the
    third underscore-separated field (``<end>``) is the exclusive end index of
    that batch. The result is the largest such value, or 0 when there is none.

    Args:
        files (list[str] | None): File names to scan. Defaults to the
            module-level ``csv_files`` list.

    Returns:
        int: The highest batch end index found (0 if no usable file name).
    """
    if files is None:
        files = csv_files

    next_start_idx = 0
    for f in files:
        print(f)
        try:
            last_row = int(f.split("_")[2])
        except (IndexError, ValueError):
            # An unrelated CSV in the folder must not abort the whole run.
            print(f"Skipping '{f}': name does not match OPUS100_<start>_<end>_rules_batch_<n>.csv")
            continue
        next_start_idx = max(next_start_idx, last_row)
    print(f"next_start_idx: {next_start_idx}")
    return next_start_idx
###############################################################################################################
###############################################################################################################


###############################################################################################################
###############################################################################################################
def push_to_hf_space(file_path, repo_id, path_in_repo=None, token=None):
    """
    Upload a single local file to a Hugging Face Space repository.

    Args:
        file_path (str): Local path to the file you want to upload.
        repo_id (str): Full repo ID in the format 'username/space-name'.
        path_in_repo (str): Destination path inside the repo. Defaults to the
            base name of ``file_path``.
        token (str): Optional Hugging Face token, handed to ``HfApi``.
    """
    if path_in_repo is None:
        # os.path.basename handles the platform's path separator and also
        # accepts path-like objects, unlike splitting the string on "/".
        path_in_repo = os.path.basename(file_path)

    # Dedicated client for this call so the caller-supplied token is used.
    client = HfApi(token=token)
    client.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type="space"
    )
    print(f"✅ Uploaded '{file_path}' to '{repo_id}' at '{path_in_repo}'")
###############################################################################################################
###############################################################################################################


###############################################################################################################
###############################################################################################################
# Parameters
batch_size = 1000
global_idx = 0  # dataset index of the latest example that contributed a new rule
start_idx = compute_next_start_idx()  # resume after the highest batch already saved
# Never index past the dataset: a different pair/split may hold fewer rows.
end_idx = min(1000000, len(dataset))
# os.makedirs(rules_path, exist_ok=True)

# Fixed column layout for every batch CSV. Passing it to DataFrame guarantees a
# header row even when a batch yields no rows; a header-less file would make
# pd.read_csv fail when the saved rules are reloaded on the next start.
batch_columns = ['Pattern', 'Transformation', 'Example']
# Separator expected between the transformation text and its example.
example_marker = ' ...   # e.g., '

# Initialize
batch_count = 0

for i in range(start_idx, end_idx, batch_size):
    new_rules_dict = {}
    batch = dataset.select(range(i, min(i + batch_size, end_idx)))

    for idx, example in enumerate(batch):
        en = example["translation"]["en"]
        print(f"global_index: {i+idx}")
        print(f"len(PHRASE_TRANSFORMATION_RULES): {len(PHRASE_TRANSFORMATION_RULES)}")
        _, bb = analyze_sentence3(en, PHRASE_TRANSFORMATION_RULES)
        if not bb:
            continue
        for k, v in bb.items():
            # Keep only patterns that are not in the rule table yet; if several
            # sentences of this batch yield the same pattern, the last one wins.
            if k not in PHRASE_TRANSFORMATION_RULES:
                global_idx = i + idx
                # Tag the rule with the dataset index and source sentence.
                new_rules_dict[k] = v + f"| {global_idx} : {en}"

    print(f"new len(new_rules_dict): {len(new_rules_dict)}")
    # Merge this batch's rules (values include the index/sentence tag) into the table.
    PHRASE_TRANSFORMATION_RULES.update(new_rules_dict)
    print(f"len(PHRASE_TRANSFORMATION_RULES): {len(PHRASE_TRANSFORMATION_RULES)}")

    # Prepare batch data: only values that split into exactly
    # "<transformation><marker><example>" become CSV rows.
    rows = []
    for k, v in new_rules_dict.items():
        parts = v.split(example_marker)
        if len(parts) == 2:
            rows.append({
                'Pattern': k,
                'Transformation': parts[0].strip(),
                'Example': parts[1].strip()
            })

    # Save batch to CSV in the working directory. The file name carries the
    # batch's index range, which compute_next_start_idx() parses to resume.
    df = pd.DataFrame(rows, columns=batch_columns)
    batch_file = f"OPUS100_{i}_{i + batch_size}_rules_batch_{len(new_rules_dict)}.csv"
    df.to_csv(batch_file, index=False)

    # Optional: wait before uploading
    time.sleep(2)  # Wait for 2 seconds (adjust as needed)

    # Upload to Hugging Face Space
    push_to_hf_space(
        file_path=batch_file,
        repo_id="Juna190825/karuleste",
        path_in_repo=f"rules/{batch_file}",  # Destination path in repo
        token=token  # Required if repo is private
    )
    print(f"✅ Saved batch {batch_count} with {len(df)} rules to {batch_file}")

    batch_count += 1
###############################################################################################################
###############################################################################################################


###############################################################################################################
###############################################################################################################
import gradio as gr

# Dummy CSV file list for illustration
# csv_files = ["data_0.csv", "data_100.csv", "data_250.csv"]


# Minimal UI: a single button that reports the resume index computed from the
# saved batch file names.
demo = gr.Blocks()
with demo:
    gr.Markdown("## Compute Next Start Index")
    output_text = gr.Textbox(label="Result", interactive=False)
    compute_button = gr.Button("Compute next_start_idx")

    # The callback takes no input components; its return value fills the textbox.
    compute_button.click(compute_next_start_idx, inputs=[], outputs=output_text)


# import threading

# def background_task():
#     # while True:
#     print("Running in background...")

# threading.Thread(target=background_task, daemon=True).start()

# Alternative launch modes that were tried:
# demo.launch(share=True)
# demo.launch(ssr_mode=False)
demo.launch()
###############################################################################################################
###############################################################################################################


###############################################################################################################
###############################################################################################################
####### Get the start_idx automatically #######
# Same scan as compute_next_start_idx(): it prints every file name, then the
# resulting index, and returns it - so reuse the helper instead of repeating
# the loop here.
next_start_idx = compute_next_start_idx()
###############################################################################################################
###############################################################################################################