Spaces:

Juna190825
/

karuleste

Sleeping

App Files Files Community

karuleste / app.py

Juna190825

Update app.py

99e1442 verified 6 months ago

raw

history blame contribute delete

9.51 kB

	###############################################################################################################
	###############################################################################################################
	from huggingface_hub import HfApi
	import os
	import time

	# Name of the environment variable (the error text calls it a repo secret)
	# that must hold the Hugging Face token.
	token_name = "rules_token"
	token = os.environ.get(token_name)

	# Fail fast: nothing below can upload without a token.
	if not token:
	    raise ValueError(f"Missing Hugging Face token. Set '{token_name}' as an environment variable[secret variable in this repo].")

	print("Token is not none!")

	# Module-level client built with the token.
	api = HfApi(token=token)
	###############################################################################################################
	###############################################################################################################


	###############################################################################################################
	###############################################################################################################
	from utils import *
	from datasets import load_dataset

	# Choose which "opus100" language pair to load: keep exactly one of the two
	# load_dataset lines below active and leave the other commented out.
	dataset = load_dataset("opus100", "en-fr", split="train") # 1000000 (row count as noted by the author)
	# dataset = load_dataset("opus100", "de-en", split="train") # 1000000 (row count as noted by the author)
	# Log how many rows the selected split contains.
	print(len(dataset))
	###############################################################################################################
	###############################################################################################################


	###############################################################################################################
	###############################################################################################################
	# s_rules = load_syntactic_rules()
	# print(len(s_rules))
	# s_rules[('VP', ('VB', 'NP', 'ADVP'))]

	# Start the rule table from the rules returned by load_syntactic_rules()
	# (presumably provided by `from utils import *` -- confirm).
	PHRASE_TRANSFORMATION_RULES = {}
	PHRASE_TRANSFORMATION_RULES.update(load_syntactic_rules())

	# @title Retrieving saved rules as for analyzing next batches
	import pandas as pd
	import os

	# Folder holding the CSV batches saved by earlier runs.
	rules_path = "./rules"

	# Sorted names of every CSV batch in the folder. A missing folder is treated
	# like an empty one so that a first run does not crash in os.listdir().
	if os.path.isdir(rules_path):
	    csv_files = sorted(f for f in os.listdir(rules_path) if f.endswith('.csv'))
	else:
	    csv_files = []

	if csv_files:
	    # Load and merge all batches into one frame.
	    merged_df = pd.concat(
	        [pd.read_csv(os.path.join(rules_path, f)) for f in csv_files],
	        ignore_index=True,
	    )
	    # First column becomes the key, second column the value.
	    result_dict = dict(zip(merged_df.iloc[:, 0], merged_df.iloc[:, 1]))
	else:
	    # pd.concat() raises on an empty list, so fall back to "no saved rules".
	    merged_df = pd.DataFrame()
	    result_dict = {}

	print(f"len(PHRASE_TRANSFORMATION_RULES): {len(PHRASE_TRANSFORMATION_RULES)}")
	# Saved rules win over the initial rules when both define the same key.
	PHRASE_TRANSFORMATION_RULES = PHRASE_TRANSFORMATION_RULES | result_dict
	print(f"len(PHRASE_TRANSFORMATION_RULES): {len(PHRASE_TRANSFORMATION_RULES)}")
	###############################################################################################################
	###############################################################################################################


	###############################################################################################################
	###############################################################################################################
	def compute_next_start_idx(files=None):
	    """Return the largest end index encoded in the batch file names.

	    The index is the third "_"-separated field of each name, e.g.
	    ``OPUS100_0_1000_rules_batch_12.csv`` yields 1000.

	    Args:
	        files: Iterable of file names to inspect. Defaults to the
	            module-level ``csv_files`` list.

	    Returns:
	        int: The highest end index found, or 0 when no name carries one.
	    """
	    if files is None:
	        files = csv_files

	    next_start_idx = 0
	    for f in files:
	        print(f)
	        try:
	            last_row = int(f.split("_")[2])  # Adjusted to match filename format
	        except (IndexError, ValueError):
	            # Not a batch file (too few fields or a non-numeric field):
	            # skip it instead of aborting the whole computation.
	            continue
	        next_start_idx = max(next_start_idx, last_row)

	    print(f"next_start_idx: {next_start_idx}")
	    return next_start_idx
	###############################################################################################################
	###############################################################################################################


	###############################################################################################################
	###############################################################################################################
	def push_to_hf_space(file_path, repo_id, path_in_repo=None, token=None):
	    """
	    Uploads a file to a Hugging Face Space repository.

	    Args:
	        file_path (str): Local path to the file you want to upload.
	        repo_id (str): Full repo ID in the format 'username/space-name'.
	        path_in_repo (str): Destination path inside the repo (defaults to
	            the base name of ``file_path``).
	        token (str): Optional Hugging Face token, forwarded to ``HfApi``.
	    """
	    api = HfApi(token=token)
	    if path_in_repo is None:
	        # os.path.basename honours the platform's path separator, unlike a
	        # manual split on "/".
	        path_in_repo = os.path.basename(file_path)

	    api.upload_file(
	        path_or_fileobj=file_path,
	        path_in_repo=path_in_repo,
	        repo_id=repo_id,
	        repo_type="space",
	    )
	    print(f"✅ Uploaded '{file_path}' to '{repo_id}' at '{path_in_repo}'")
	###############################################################################################################
	###############################################################################################################


	###############################################################################################################
	###############################################################################################################
	# Parameters
	# NOTE(review): the loop nesting below was reconstructed from a copy of this
	# file whose indentation had been stripped -- confirm against the original.

	# Parameters
	batch_size = 1000
	global_idx = 0
	# Resume after the last batch that already exists in the rules folder.
	start_idx = compute_next_start_idx()
	# Never index past the end of the loaded split.
	end_idx = min(1000000, len(dataset))

	# Column layout of every batch CSV. Passing it explicitly keeps the header
	# row even when a batch yields no rows.
	batch_columns = ['Pattern', 'Transformation', 'Example']
	# Separator expected between the transformation and its example.
	example_marker = ' ... # e.g., '

	# Initialize
	batch_count = 0

	for i in range(start_idx, end_idx, batch_size):
	    batch_end = min(i + batch_size, end_idx)
	    new_rules_dict = {}
	    batch = dataset.select(range(i, batch_end))

	    for idx, example in enumerate(batch):
	        en = example["translation"]["en"]
	        sentence_idx = i + idx
	        print(f"global_index: {sentence_idx}")
	        print(f"len(PHRASE_TRANSFORMATION_RULES): {len(PHRASE_TRANSFORMATION_RULES)}")
	        _, bb = analyze_sentence3(en, PHRASE_TRANSFORMATION_RULES)
	        if not bb:
	            continue
	        for k, v in bb.items():
	            if k not in PHRASE_TRANSFORMATION_RULES:
	                global_idx = sentence_idx
	                # Record which sentence produced the rule. A pattern seen
	                # again later in the same batch overwrites this entry.
	                new_rules_dict[k] = v + f"| {global_idx} : {en}"

	    print(f"new len(new_rules_dict): {len(new_rules_dict)}")
	    # Make the new rules visible to the following batches.
	    PHRASE_TRANSFORMATION_RULES.update(new_rules_dict)
	    print(f"len(PHRASE_TRANSFORMATION_RULES): {len(PHRASE_TRANSFORMATION_RULES)}")

	    # Prepare batch data. Values that do not split into exactly two parts
	    # around the marker are left out of the CSV (they still count in the
	    # file name and stay in PHRASE_TRANSFORMATION_RULES).
	    rows = []
	    for k, v in new_rules_dict.items():
	        parts = v.split(example_marker)
	        if len(parts) == 2:
	            rows.append({
	                'Pattern': k,
	                'Transformation': parts[0].strip(),
	                'Example': parts[1].strip(),
	            })

	    # Save batch to CSV. The third "_" field of the name is the end index
	    # that compute_next_start_idx() reads back on the next run.
	    df = pd.DataFrame(rows, columns=batch_columns)
	    batch_file = f"OPUS100_{i}_{batch_end}_rules_batch_{len(new_rules_dict)}.csv"
	    df.to_csv(batch_file, index=False)

	    # Optional: wait before uploading
	    time.sleep(2)  # Wait for 2 seconds (adjust as needed)

	    # Upload to Hugging Face Space
	    push_to_hf_space(
	        file_path=batch_file,
	        repo_id="Juna190825/karuleste",
	        path_in_repo=f"rules/{batch_file}",  # Destination path in repo
	        token=token,  # Required if repo is private
	    )
	    print(f"✅ Saved batch {batch_count} with {len(df)} rules to {batch_file}")

	    batch_count += 1
	###############################################################################################################
	###############################################################################################################


	###############################################################################################################
	###############################################################################################################
	import gradio as gr

	# Dummy CSV file list for illustration
	# csv_files = ["data_0.csv", "data_100.csv", "data_250.csv"]


	# Small UI: a single button that runs compute_next_start_idx() and shows the
	# returned value in a read-only text box.
	with gr.Blocks() as demo:
	    gr.Markdown("## Compute Next Start Index")
	    output_text = gr.Textbox(label="Result", interactive=False)
	    compute_button = gr.Button("Compute next_start_idx")

	    # The callback takes no inputs; its return value fills the text box.
	    compute_button.click(
	        fn=compute_next_start_idx,
	        inputs=[],
	        outputs=output_text,
	    )

	# Alternatives that were tried and left disabled:
	#   - a daemon threading.Thread running a background_task() that prints
	#     "Running in background..."
	#   - demo.launch(share=True)
	#   - demo.launch(ssr_mode=False)
	demo.launch()
	###############################################################################################################
	###############################################################################################################


	###############################################################################################################
	###############################################################################################################
	####### Get the start_idx automatically #######
	# Call compute_next_start_idx() rather than keeping a second copy of its
	# loop: it prints the same lines and returns the same value.
	# NOTE(review): this statement follows demo.launch(); if launch() blocks (the
	# usual behaviour when run as a script), it only runs after the server
	# stops -- confirm whether it is still needed here.
	next_start_idx = compute_next_start_idx()
	###############################################################################################################
	###############################################################################################################