Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import os | |
| import subprocess | |
| import shutil | |
| import zipfile | |
| import sys | |
| def zip_directory(folder_path, zip_path): | |
| with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf: | |
| for root, _, files in os.walk(folder_path): | |
| for file in files: | |
| full_path = os.path.join(root, file) | |
| arcname = os.path.relpath(full_path, folder_path) | |
| zipf.write(full_path, arcname) | |
def process_bz2_file(bz2_file):
    """Run WikiExtractor on an uploaded ``.bz2`` dump and zip what it produces.

    The text of the file name before the first ``"wiki"`` is used as the
    language code; it names both the extraction directory
    (``output/<lang_code>``) and the resulting archive
    (``output/<lang_code>.zip``).

    Args:
        bz2_file: The uploaded file: ``None`` when nothing is uploaded, a
            filesystem path, or an object whose ``name`` attribute holds the
            path of the uploaded file.

    Returns:
        A ``(status_message, zip_path)`` tuple. ``zip_path`` is ``None`` when
        no file was supplied or when WikiExtractor could not be run
        successfully; in that case the message carries the error details.
    """
    if bz2_file is None:
        return "No file uploaded.", None

    # Accept both a plain path and a file-like wrapper exposing ``.name``.
    if isinstance(bz2_file, (str, os.PathLike)):
        input_path = os.fspath(bz2_file)
    else:
        input_path = bz2_file.name
    file_name = os.path.basename(input_path)

    output_dir = "output"

    # Extract language code
    lang_code = file_name.split("wiki")[0]
    # An empty code would make the extraction directory the output root
    # itself, and "." / ".." would point at or above it. The name comes from
    # an upload, so never let it select anything but a real subdirectory.
    if lang_code in ("", ".", ".."):
        lang_code = "unknown"
    lang_output_dir = os.path.join(output_dir, lang_code)

    # Start from an empty directory so files left by an earlier run for the
    # same language do not end up in this run's archive.
    shutil.rmtree(lang_output_dir, ignore_errors=True)
    os.makedirs(lang_output_dir, exist_ok=True)

    # Run WikiExtractor using the current Python interpreter
    command = [
        sys.executable, "-m", "wikiextractor.WikiExtractor",
        input_path,
        "-o", lang_output_dir,
        "--json",
    ]
    try:
        # errors="replace" keeps undecodable bytes in the captured output
        # from raising UnicodeDecodeError after the process has finished.
        subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
            errors="replace",
        )
    except subprocess.CalledProcessError as e:
        error_msg = e.stderr or str(e)
        return f"❌ Error processing {file_name}:\n{error_msg}", None
    except OSError as e:
        # The interpreter itself could not be started.
        return f"❌ Error processing {file_name}:\n{e}", None

    # Zip the output
    zip_path = os.path.join(output_dir, f"{lang_code}.zip")
    zip_directory(lang_output_dir, zip_path)
    return f"✅ Processed {file_name}. Download ready.", zip_path
# UI: an upload widget, a status box and a download slot for the ZIP.
with gr.Blocks() as demo:
    gr.Markdown("# WikiExtractor App\nUpload a Wikipedia .bz2 dump and download extracted text.")

    file_input = gr.File(file_types=[".bz2"], label="Upload .bz2 file")
    output_text = gr.Textbox(label="Status")
    download_file = gr.File(label="Download Output ZIP")

    # The change event of the upload widget calls process_bz2_file with the
    # widget's value; its two return values fill the status box and the
    # download slot, in that order.
    file_input.change(
        process_bz2_file,
        file_input,
        [output_text, download_file],
    )

# Launch only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()