Spaces:
Sleeping
Sleeping
| # app.py | |
| # Gradio web app for batch .docx processing: | |
| # - After the first TAB in each paragraph, strip leading spaces and capitalize first letter (TR-aware). | |
| # - Search dialogues (optional) and preview changes. | |
| # - Download ZIP of processed files. | |
| import os | |
| import io | |
| import shutil | |
| import tempfile | |
| import zipfile | |
| from typing import List, Tuple, Dict, Any | |
| import pandas as pd | |
| from docx import Document | |
| import gradio as gr | |
| # ---------- Text helpers ---------- | |
def tr_upper_initial(ch: str) -> str:
    """Uppercase one leading character using Turkish casing rules.

    Dotted ``i`` becomes ``İ`` and dotless ``ı`` becomes ``I``; any other
    input is returned as ``ch.upper()``.
    """
    turkish_specials = {"i": "İ", "ı": "I"}
    special = turkish_specials.get(ch)
    return ch.upper() if special is None else special
def normalize_delim(delim: str) -> str:
    """Resolve the user-typed delimiter to the string actually used for splitting.

    ``None``, the empty string, and the two-character escape ``\\t``
    (backslash followed by ``t``) all resolve to a real TAB character.
    Any other value is returned unchanged.
    """
    tab = "\t"
    means_tab = delim is None or delim in ("", r"\t")
    return tab if means_tab else delim
| # ---------- Core processors ---------- | |
def process_paragraph_simple(text: str, delim: str) -> Tuple[str, Dict[str, Any]]:
    """
    Clean up the text that follows the first delimiter in a paragraph string.

    Operates on the flat paragraph text only (the non-format-preserving mode:
    callers write the result back through paragraph.text, which merges runs).

    The part after the first ``delim`` has its leading whitespace removed and,
    if it then starts with a lowercase letter, that letter is capitalized via
    ``tr_upper_initial``. The part before the delimiter is left untouched.

    Returns (new_text, change_meta). ``change_meta`` has the keys:
      - "changed": True when new_text differs from text
      - "left": text before the delimiter (None when delim is absent)
      - "right_before": text after the delimiter as found (None when absent)
      - "right_after": text after the delimiter once cleaned (None when absent)
    """
    head, sep, tail = text.partition(delim)
    if not sep:
        # Delimiter does not occur: nothing to split, report "unchanged".
        return text, {"changed": False, "left": None, "right_before": None, "right_after": None}

    cleaned = tail.lstrip()
    # Only a lowercase initial is rewritten; digits, punctuation and letters
    # that are already uppercase stay as they are.
    if cleaned and cleaned[0].islower():
        cleaned = tr_upper_initial(cleaned[0]) + cleaned[1:]

    rebuilt = head + delim + cleaned
    meta = {
        "changed": rebuilt != text,
        "left": head,
        "right_before": tail,
        "right_after": cleaned,
    }
    return rebuilt, meta
def process_document(
    in_path: str,
    out_path: str,
    delim: str = "\t",
    preserve_runs: bool = False  # kept for future extensibility; current mode is simple
) -> List[Dict[str, Any]]:
    """
    Apply the dialogue clean-up to every paragraph of one .docx file.

    The document at ``in_path`` is loaded, each entry of ``doc.paragraphs`` is
    passed through ``process_paragraph_simple``, and the (possibly modified)
    document is always written to ``out_path`` - even when nothing changed.

    ``preserve_runs`` is accepted but not consulted by this implementation.

    Returns one record per modified paragraph, with the keys "file",
    "paragraph_index", "before", "after", "left_side", "right_before" and
    "right_after", suitable for building the preview table.
    """
    document = Document(in_path)
    file_name = os.path.basename(in_path)
    records: List[Dict[str, Any]] = []

    for position, paragraph in enumerate(document.paragraphs):
        before = paragraph.text
        after, info = process_paragraph_simple(before, delim)
        if not info["changed"]:
            continue
        # Writing through .text is the "simple" mode: the paragraph content is
        # replaced as a single string rather than edited run by run.
        paragraph.text = after
        records.append({
            "file": file_name,
            "paragraph_index": position,
            "before": before,
            "after": after,
            "left_side": info["left"],
            "right_before": info["right_before"],
            "right_after": info["right_after"],
        })

    document.save(out_path)
    return records
| # ---------- Gradio callable ---------- | |
def run_job(
    files: List[str],
    search_query: str,
    delimiter_input: str,
) -> Tuple[str, pd.DataFrame]:
    """
    Gradio interface function: process uploaded .docx files and build a ZIP.

    Inputs:
    - files: list of file paths; entries not ending in ".docx" are skipped
    - search_query: optional literal substring used to filter the change log
      (case-insensitive, matched against before/after/right_before/right_after)
    - delimiter_input: "\\t" or a literal string to split dialogue

    Outputs:
    - path to ZIP of processed docs ("" when no files were supplied)
    - DataFrame with change log (filtered by search if provided), limited to
      the columns file, paragraph_index, before, after
    """
    if not files:
        return "", pd.DataFrame(columns=["file", "paragraph_index", "before", "after"])

    delim = normalize_delim(delimiter_input)

    # The working directory is not removed here: the ZIP returned to the
    # caller lives inside it.
    workdir = tempfile.mkdtemp(prefix="docx_batch_")
    outdir = os.path.join(workdir, "out")
    os.makedirs(outdir, exist_ok=True)

    all_changes = []
    for fpath in files:
        if not fpath.lower().endswith(".docx"):
            continue
        base = os.path.basename(fpath)
        root, _ = os.path.splitext(base)
        out_path = os.path.join(outdir, f"{root}_Capitalized_Strip.docx")
        all_changes.extend(process_document(fpath, out_path, delim=delim))

    # Build preview table
    df = pd.DataFrame(all_changes, columns=[
        "file", "paragraph_index", "before", "after", "left_side", "right_before", "right_after"
    ])

    # Apply search filter if provided (search right_before/right_after plus full before/after)
    if search_query and not df.empty:
        q = search_query.lower()
        # regex=False: the query is a plain substring. pandas' default treats
        # it as a regular expression, so input such as "(" would raise and
        # "." or "?" would match as metacharacters instead of literally.
        mask = (
            df["before"].str.lower().str.contains(q, regex=False, na=False) |
            df["after"].str.lower().str.contains(q, regex=False, na=False) |
            df["right_before"].fillna("").str.lower().str.contains(q, regex=False, na=False) |
            df["right_after"].fillna("").str.lower().str.contains(q, regex=False, na=False)
        )
        df = df[mask].reset_index(drop=True)

    # Create ZIP containing every file written to outdir
    zip_path = os.path.join(workdir, "Processed_Docx.zip")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for name in os.listdir(outdir):
            zf.write(os.path.join(outdir, name), arcname=name)

    return zip_path, df[["file", "paragraph_index", "before", "after"]]
| # ---------- UI ---------- | |
# Declarative Gradio layout: components are created in display order inside
# the Blocks context, and the Process button is wired to run_job at the end.
# NOTE(review): indentation was lost in SOURCE, so which components sit inside
# each gr.Row() below is reconstructed - confirm against the deployed layout.
with gr.Blocks(title="DOCX Dialogue Capitalizer (TR-aware)") as demo:
    # Short usage notes shown at the top of the page.
    gr.Markdown(
        "### DOCX Dialogue Capitalizer\n"
        "- Split at the **first delimiter** (default: TAB), strip leading spaces, then **capitalize the first letter**.\n"
        "- Designed for Turkish (`i→İ`, `ı→I`).\n"
        "- Upload multiple `.docx`, optionally **search** results, and **download ZIP**."
    )
    # Inputs: uploaded files (passed to run_job as file paths), delimiter, search text.
    with gr.Row():
        file_in = gr.File(
            label="Upload .docx files",
            file_count="multiple",
            file_types=[".docx"],
            type="filepath"
        )
        # Default value is the two characters backslash + t; run_job converts
        # it to a real TAB through normalize_delim.
        delimiter = gr.Textbox(label="Delimiter", value="\\t", info="Use \\t for TAB, or any literal (e.g., '—' or ':').")
        search = gr.Textbox(label="Search (optional)", placeholder="Substring to filter changed lines…")
    run_btn = gr.Button("Process")
    # Outputs: the ZIP produced by run_job and the change-log table.
    with gr.Row():
        zip_out = gr.File(label="Download ZIP (processed files)")
        df_out = gr.Dataframe(
            label="Preview of Changes / Search Matches",
            interactive=False,
            wrap=True,
            max_height=400 # was: height=400
        )
    # Input order must match run_job's parameters (files, search_query,
    # delimiter_input); output order must match its return tuple.
    run_btn.click(fn=run_job, inputs=[file_in, search, delimiter], outputs=[zip_out, df_out])
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    # For Colab: set share=True to get a public URL
    # Listens on all network interfaces (0.0.0.0) at port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)