"""Gradio app that runs WikiExtractor on an uploaded Wikipedia .bz2 dump.

The uploaded dump is passed to ``wikiextractor.WikiExtractor`` (run as a
module under the current interpreter); the extracted JSON output is zipped
and offered back to the user as a download.
"""

import os
import shutil
import subprocess
import sys
import zipfile

import gradio as gr


def zip_directory(folder_path, zip_path):
    """Write every file under ``folder_path`` into a new ZIP at ``zip_path``.

    Entries are stored with paths relative to ``folder_path`` and compressed
    with DEFLATE. Directories themselves are not stored, so empty
    directories do not appear in the archive.

    Args:
        folder_path: Directory whose files are archived (walked recursively).
        zip_path: Destination of the archive; an existing file is overwritten.
    """
    zip_abs = os.path.abspath(zip_path)
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            # Sorting in place makes the walk (and the archive) deterministic.
            dirs.sort()
            for file in sorted(files):
                full_path = os.path.join(root, file)
                # If the archive lives inside the folder being zipped, do not
                # let it swallow its own partially written file.
                if os.path.abspath(full_path) == zip_abs:
                    continue
                arcname = os.path.relpath(full_path, folder_path)
                zipf.write(full_path, arcname)


def _language_code(file_name):
    """Return the part of ``file_name`` before the first ``"wiki"``.

    Falls back to ``"unknown"`` when that prefix is empty or consists only of
    dots, so the result is always a usable, non-special directory name.
    """
    prefix = file_name.partition("wiki")[0].strip(".")
    return prefix or "unknown"


def process_bz2_file(bz2_file):
    """Run WikiExtractor on an uploaded dump and zip the extracted output.

    Args:
        bz2_file: The value delivered by the ``gr.File`` input: ``None`` when
            nothing is uploaded, otherwise either an object exposing the
            file path as ``.name`` or the path itself as a string.

    Returns:
        A ``(status_message, zip_path)`` tuple. ``zip_path`` is ``None`` when
        no file was given or when processing failed.
    """
    if bz2_file is None:
        return "No file uploaded.", None

    # Accept both file-like upload objects (``.name``) and plain path strings.
    input_path = getattr(bz2_file, "name", None) or str(bz2_file)
    file_name = os.path.basename(input_path)

    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)

    # Extract language code
    lang_code = _language_code(file_name)
    lang_output_dir = os.path.join(output_dir, lang_code)
    # Start from an empty directory so files left by an earlier upload with
    # the same language code do not leak into this archive.
    shutil.rmtree(lang_output_dir, ignore_errors=True)
    os.makedirs(lang_output_dir, exist_ok=True)

    # Run WikiExtractor using the current Python interpreter
    command = [
        sys.executable,
        "-m",
        "wikiextractor.WikiExtractor",
        input_path,
        "-o",
        lang_output_dir,
        "--json",
    ]

    try:
        subprocess.run(command, check=True, capture_output=True, text=True)

        # Zip the output
        zip_path = os.path.join(output_dir, f"{lang_code}.zip")
        zip_directory(lang_output_dir, zip_path)
    except subprocess.CalledProcessError as e:
        error_msg = e.stderr or str(e)
        return f"❌ Error processing {file_name}:\n{error_msg}", None
    except OSError as e:
        # Launching the interpreter or writing the archive failed; report it
        # in the status box instead of surfacing a traceback.
        return f"❌ Error processing {file_name}:\n{e}", None

    return f"✅ Processed {file_name}. Download ready.", zip_path


# UI
with gr.Blocks() as demo:
    gr.Markdown(
        "# WikiExtractor App\n"
        "Upload a Wikipedia .bz2 dump and download extracted text."
    )
    file_input = gr.File(label="Upload .bz2 file", file_types=[".bz2"])
    output_text = gr.Textbox(label="Status")
    download_file = gr.File(label="Download Output ZIP")

    # Re-run processing whenever the uploaded file changes (including when
    # it is cleared, which delivers None to the handler).
    file_input.change(
        fn=process_bz2_file,
        inputs=file_input,
        outputs=[output_text, download_file],
    )

# Launch
if __name__ == "__main__":
    demo.launch()