Spaces:

Mafia2008
/

Tefvbkk

Sleeping

App Files Files Community

Tefvbkk / app.py

Mafia2008

Update app.py

e29c87e verified 28 days ago

raw

history blame contribute delete

6.24 kB

	import os
	import json
	import subprocess
	import requests
	import gradio as gr
	from datetime import datetime
	from huggingface_hub import HfApi, login

	# Target Dataset Repository
	REPO_ID = "Mafia2008/Vod"

	def extract_json_from_http_text(text: str):
	"""Extracts the JSON payload from the raw HTTP capture text."""
	try:
	start_idx = text.find('{"status"')
	if start_idx == -1: return None
	body = text[start_idx:]
	end_idx = body.rfind('}')
	return json.loads(body[:end_idx+1]) if end_idx != -1 else None
	except:
	return None

	def convert_and_upload_single(api, item, folder_name):
	"""Downloads/converts a single file, uploads it immediately, and cleans up."""
	title = item.get("title", "untitled").replace("/", "-").replace("\\", "-").strip()
	link = item.get("link")
	duration = item.get("duration", "0")
	original_date = item.get("date", "")

	is_video = "VIDEO" in item.get("type", "").upper() or "video" in folder_name.lower()
	ext = ".mp4" if is_video else ".pdf"

	local_filename = f"{title}{ext}"
	local_path = os.path.join("/tmp", local_filename)
	repo_path = f"{folder_name}/{local_filename}"

	# Generate the direct download link for Hugging Face
	hf_direct_link = f"https://huggingface.co/datasets/{REPO_ID}/resolve/main/{repo_path.replace(' ', '%20')}"

	try:
	if is_video:
	print(f"🎬 Downloading and converting: {title}")
	# FFmpeg with User-Agent and Whitelist to fix the 728kb error
	command = [
	"ffmpeg", "-y",
	"-user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
	"-protocol_whitelist", "file,http,https,tcp,tls,crypto",
	"-i", link,
	"-c", "copy", "-bsf:a", "aac_adtstoasc",
	local_path
	]
	subprocess.run(command, check=True)

	# Size check: If it's under 1MB, something blocked FFmpeg
	if os.path.exists(local_path) and os.path.getsize(local_path) < 1024 * 1024:
	if os.path.exists(local_path): os.remove(local_path)
	return False, f"❌ Failed {title}: Output file is too small (likely blocked by server).", None

	else:
	print(f"📄 Downloading PDF: {title}")
	with requests.get(link, stream=True) as r:
	r.raise_for_status()
	with open(local_path, 'wb') as f:
	for chunk in r.iter_content(chunk_size=8192): f.write(chunk)

	# Upload the single file to Hugging Face immediately
	api.upload_file(
	path_or_fileobj=local_path,
	path_in_repo=repo_path,
	repo_id=REPO_ID,
	repo_type="dataset"
	)

	# Clean up local storage immediately to prevent crashes
	if os.path.exists(local_path): os.remove(local_path)

	# Build the metadata record for the final JSON
	metadata = {
	"title": title,
	"type": "VIDEO" if is_video else "PDF",
	"duration": duration,
	"original_date": original_date,
	"upload_date": datetime.utcnow().isoformat() + "Z",
	"hf_direct_link": hf_direct_link
	}
	return True, f"✅ Uploaded: {local_filename}", metadata

	except Exception as e:
	if os.path.exists(local_path): os.remove(local_path)
	return False, f"❌ Failed {title}: {str(e)}", None

	def start_process(vid_file, not_file):
	hf_token = os.environ.get("HF_TOKEN")
	if not hf_token: return "HF_TOKEN missing in Secrets.", ""
	login(token=hf_token)
	api = HfApi()

	with open(vid_file, 'r') as f: vid_data = extract_json_from_http_text(f.read())
	with open(not_file, 'r') as f: not_data = extract_json_from_http_text(f.read())

	logs = []
	final_json_data = {
	"dataset_repo": REPO_ID,
	"last_updated": datetime.utcnow().isoformat() + "Z",
	"videos": [],
	"notes": []
	}

	# 1. Process and Upload Videos One-by-One
	if vid_data:
	logs.append("### 🎥 Processing Videos")
	for item in vid_data.get('data', {}).get('chapters', []):
	success, msg, meta = convert_and_upload_single(api, item, "English 12th videos")
	logs.append(msg)
	if success and meta:
	final_json_data["videos"].append(meta)

	# 2. Process and Upload Notes One-by-One
	if not_data:
	logs.append("\n### 📄 Processing Notes")
	for item in not_data.get('data', {}).get('chapters', []):
	success, msg, meta = convert_and_upload_single(api, item, "English 12th notes")
	logs.append(msg)
	if success and meta:
	final_json_data["notes"].append(meta)

	# 3. Save and Upload the Master JSON Index
	logs.append("\n### 📝 Generating Master JSON File")
	json_path = "/tmp/index.json"
	with open(json_path, 'w', encoding='utf-8') as f:
	json.dump(final_json_data, f, indent=4)

	try:
	api.upload_file(
	path_or_fileobj=json_path,
	path_in_repo="index.json",
	repo_id=REPO_ID,
	repo_type="dataset"
	)
	logs.append("✅ Success: Uploaded index.json to dataset root")
	except Exception as e:
	logs.append(f"❌ Failed to upload index.json: {str(e)}")

	if os.path.exists(json_path): os.remove(json_path)

	return "Process Finished", "\n".join(logs)

	# Build the Web UI
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	gr.Markdown(f"# 🎬 Fast Pipeline Worker for {REPO_ID}")
	gr.Markdown("Safely converts and uploads videos one-by-one to prevent storage crashes, then generates a master `index.json`.")

	with gr.Row():
	v_in = gr.File(label="Upload Video TXT")
	n_in = gr.File(label="Upload Notes TXT")

	btn = gr.Button("Start Conversion & Upload", variant="primary")
	out = gr.Textbox(label="Status")
	log = gr.Markdown(label="Execution Logs")

	btn.click(start_process, [v_in, n_in], [out, log])

	if __name__ == "__main__":
	demo.launch(server_name="0.0.0.0", server_port=7860)