# process_wiki_bz / app.py
# SitwalaM
# changed stuff for error
# 11674a6
import gradio as gr
import os
import subprocess
import shutil
import zipfile
import sys
def zip_directory(folder_path, zip_path):
    """Write every file under *folder_path* into a new ZIP archive.

    Args:
        folder_path: Directory whose files are archived (walked recursively).
        zip_path: Destination of the archive; an existing file is overwritten.

    Archive member names are paths relative to *folder_path*. Only files are
    stored, so empty directories do not appear in the archive.
    """
    # The archive may live inside the folder being zipped; remember its real
    # location so the half-written archive is never added to itself.
    archive_real = os.path.realpath(zip_path)
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            # Sorting in place makes the traversal (and entry order) stable.
            dirs.sort()
            for file in sorted(files):
                full_path = os.path.join(root, file)
                if os.path.realpath(full_path) == archive_real:
                    continue
                arcname = os.path.relpath(full_path, folder_path)
                zipf.write(full_path, arcname)
def process_bz2_file(bz2_file):
    """Run WikiExtractor on an uploaded ``.bz2`` dump and zip its output.

    Args:
        bz2_file: The uploaded file. Either an object exposing the file's
            path as ``.name`` or a plain path string; ``None`` means nothing
            was uploaded.

    Returns:
        A ``(status_message, zip_path)`` tuple. ``zip_path`` is ``None`` when
        no file was given or when WikiExtractor exits with an error.
    """
    if bz2_file is None:
        return "No file uploaded.", None

    # Accept both an upload object (path in ``.name``) and a bare path string.
    input_path = getattr(bz2_file, "name", bz2_file)
    file_name = os.path.basename(input_path)
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)

    # Extract language code: the part of the file name before the first "wiki".
    # The name comes from the upload, so keep only characters that are safe in
    # a single path component and drop leading/trailing dots; otherwise a
    # prefix such as "" or ".." would point the extraction (and the ZIP) at
    # ``output`` itself or at its parent directory.
    raw_code = file_name.split("wiki")[0]
    lang_code = "".join(
        ch for ch in raw_code if ch.isalnum() or ch in "._-"
    ).strip(".") or "unknown"
    lang_output_dir = os.path.join(output_dir, lang_code)

    # Start from an empty directory so files left by an earlier run for the
    # same language do not end up in this run's archive.
    shutil.rmtree(lang_output_dir, ignore_errors=True)
    os.makedirs(lang_output_dir, exist_ok=True)

    # Run WikiExtractor using the current Python interpreter
    command = [
        sys.executable, "-m", "wikiextractor.WikiExtractor",
        input_path,
        "-o", lang_output_dir,
        "--json",
    ]
    try:
        subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        # Prefer the tool's own stderr; fall back to the exception summary.
        error_msg = e.stderr or str(e)
        return f"❌ Error processing {file_name}:\n{error_msg}", None

    # Zip the output
    zip_path = os.path.join(output_dir, f"{lang_code}.zip")
    zip_directory(lang_output_dir, zip_path)
    return f"✅ Processed {file_name}. Download ready.", zip_path
# UI: an upload widget feeding a status box and a downloadable archive.
with gr.Blocks() as demo:
    gr.Markdown(
        "# WikiExtractor App\n"
        "Upload a Wikipedia .bz2 dump and download extracted text."
    )
    file_input = gr.File(file_types=[".bz2"], label="Upload .bz2 file")
    output_text = gr.Textbox(label="Status")
    download_file = gr.File(label="Download Output ZIP")

    # Process the file each time the upload component's value changes; the
    # handler's two return values fill the status box and the download slot.
    file_input.change(
        process_bz2_file,
        file_input,
        [output_text, download_file],
    )

# Launch only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()