import json
import os
import subprocess
from datetime import datetime, timezone

import gradio as gr
import requests
from huggingface_hub import HfApi, login

# Target Dataset Repository
REPO_ID = "Mafia2008/Vod"


def _utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with a 'Z' suffix."""
    # datetime.utcnow() is deprecated since Python 3.12; an aware datetime
    # produces the same "....Z" string once "+00:00" is normalized.
    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")


def extract_json_from_http_text(text: str):
    """Extract the JSON payload embedded in a raw HTTP capture.

    Scans for the first occurrence of '{"status"' and parses everything up
    to the last closing brace.

    Args:
        text: Raw HTTP capture text (headers + body).

    Returns:
        The decoded JSON object, or None when no payload is found or it
        cannot be parsed.
    """
    start_idx = text.find('{"status"')
    if start_idx == -1:
        return None
    body = text[start_idx:]
    end_idx = body.rfind('}')
    if end_idx == -1:
        return None
    try:
        return json.loads(body[:end_idx + 1])
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; a narrow except avoids
        # silently swallowing unrelated bugs (the original used a bare except).
        return None


def convert_and_upload_single(api, item, folder_name):
    """Download/convert a single item, upload it immediately, and clean up.

    Videos are remuxed from their HLS link via FFmpeg; notes are downloaded
    as PDFs. The local file is always removed afterwards so the Space's
    ephemeral disk never fills up.

    Args:
        api: Authenticated huggingface_hub.HfApi client.
        item: Dict with at least 'title' and 'link'; optional 'duration',
            'date' and 'type' keys are read as well.
        folder_name: Target folder inside the dataset repository.

    Returns:
        Tuple of (success, log_message, metadata). ``metadata`` is a dict
        describing the uploaded file on success, otherwise None.
    """
    title = item.get("title", "untitled").replace("/", "-").replace("\\", "-").strip()
    link = item.get("link")
    duration = item.get("duration", "0")
    original_date = item.get("date", "")

    is_video = "VIDEO" in item.get("type", "").upper() or "video" in folder_name.lower()
    ext = ".mp4" if is_video else ".pdf"

    local_filename = f"{title}{ext}"
    local_path = os.path.join("/tmp", local_filename)
    repo_path = f"{folder_name}/{local_filename}"

    # Generate the direct download link for Hugging Face
    hf_direct_link = (
        f"https://huggingface.co/datasets/{REPO_ID}/resolve/main/"
        f"{repo_path.replace(' ', '%20')}"
    )

    try:
        if is_video:
            print(f"🎬 Downloading and converting: {title}")
            # FFmpeg with User-Agent and Whitelist to fix the 728kb error
            command = [
                "ffmpeg", "-y",
                "-user_agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                "-protocol_whitelist", "file,http,https,tcp,tls,crypto",
                "-i", link,
                "-c", "copy",
                "-bsf:a", "aac_adtstoasc",
                local_path,
            ]
            subprocess.run(command, check=True)

            # Size check: If it's under 1MB, something blocked FFmpeg
            if os.path.exists(local_path) and os.path.getsize(local_path) < 1024 * 1024:
                return False, f"❌ Failed {title}: Output file is too small (likely blocked by server).", None
        else:
            print(f"📄 Downloading PDF: {title}")
            # A timeout prevents a dead link from hanging the whole pipeline
            # (the original request could block forever).
            with requests.get(link, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(local_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

        # Upload the single file to Hugging Face immediately
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=repo_path,
            repo_id=REPO_ID,
            repo_type="dataset",
        )

        # Build the metadata record for the final JSON
        metadata = {
            "title": title,
            "type": "VIDEO" if is_video else "PDF",
            "duration": duration,
            "original_date": original_date,
            "upload_date": _utc_now_iso(),
            "hf_direct_link": hf_direct_link,
        }
        return True, f"✅ Uploaded: {local_filename}", metadata

    except Exception as e:
        return False, f"❌ Failed {title}: {str(e)}", None
    finally:
        # Clean up local storage immediately to prevent storage crashes —
        # runs on every exit path (success, too-small output, or error).
        if os.path.exists(local_path):
            os.remove(local_path)


def _load_capture(path):
    """Read an uploaded capture file and return its extracted JSON payload.

    Returns None when the user did not upload a file (gr.File yields None)
    or when no JSON payload can be extracted.
    """
    if not path:
        return None
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        return extract_json_from_http_text(f.read())


def start_process(vid_file, not_file):
    """Run the full pipeline and return (status, markdown_log) for the UI.

    Steps: authenticate with HF, parse both capture files, upload videos and
    notes one-by-one, then publish a master ``index.json`` to the repo root.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        return "HF_TOKEN missing in Secrets.", ""

    login(token=hf_token)
    api = HfApi()

    # Missing uploads are tolerated instead of crashing on open(None).
    vid_data = _load_capture(vid_file)
    not_data = _load_capture(not_file)

    logs = []
    final_json_data = {
        "dataset_repo": REPO_ID,
        "last_updated": _utc_now_iso(),
        "videos": [],
        "notes": [],
    }

    # 1. Process and Upload Videos One-by-One
    if vid_data:
        logs.append("### 🎥 Processing Videos")
        for item in vid_data.get('data', {}).get('chapters', []):
            success, msg, meta = convert_and_upload_single(api, item, "English 12th videos")
            logs.append(msg)
            if success and meta:
                final_json_data["videos"].append(meta)

    # 2. Process and Upload Notes One-by-One
    if not_data:
        logs.append("\n### 📄 Processing Notes")
        for item in not_data.get('data', {}).get('chapters', []):
            success, msg, meta = convert_and_upload_single(api, item, "English 12th notes")
            logs.append(msg)
            if success and meta:
                final_json_data["notes"].append(meta)

    # 3. Save and Upload the Master JSON Index
    logs.append("\n### 📝 Generating Master JSON File")
    json_path = "/tmp/index.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(final_json_data, f, indent=4)

    try:
        api.upload_file(
            path_or_fileobj=json_path,
            path_in_repo="index.json",
            repo_id=REPO_ID,
            repo_type="dataset",
        )
        logs.append("✅ Success: Uploaded index.json to dataset root")
    except Exception as e:
        logs.append(f"❌ Failed to upload index.json: {str(e)}")
    finally:
        if os.path.exists(json_path):
            os.remove(json_path)

    return "Process Finished", "\n".join(logs)


# Build the Web UI
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"# 🎬 Fast Pipeline Worker for {REPO_ID}")
    gr.Markdown(
        "Safely converts and uploads videos one-by-one to prevent storage "
        "crashes, then generates a master `index.json`."
    )
    with gr.Row():
        v_in = gr.File(label="Upload Video TXT")
        n_in = gr.File(label="Upload Notes TXT")
    btn = gr.Button("Start Conversion & Upload", variant="primary")
    out = gr.Textbox(label="Status")
    log = gr.Markdown(label="Execution Logs")
    btn.click(start_process, [v_in, n_in], [out, log])

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)