Spaces:

armyneo
/

trailspace

Sleeping

App Files Files Community

armyneo committed on Sep 2, 2025

Commit

303dc05

verified ·

1 Parent(s): 460d131

app.py

Browse files

Files changed (1) hide show

app.py +186 -0

app.py ADDED Viewed

	@@ -0,0 +1,186 @@

+# app.py
+# Gradio web app for batch .docx processing:
+# - After the first delimiter (default: TAB) in each paragraph, strip leading whitespace and capitalize the first letter (TR-aware).
+# - Search dialogues (optional) and preview changes.
+# - Download ZIP of processed files.
+import os
+import io
+import shutil
+import tempfile
+import zipfile
+from typing import List, Tuple, Dict, Any
+import pandas as pd
+from docx import Document
+import gradio as gr
+# ---------- Text helpers ----------
def tr_upper_initial(ch: str) -> str:
    """Return the uppercase form of *ch* using Turkish rules for the two i's.

    The dotted lowercase ``i`` maps to dotted capital ``İ`` and the dotless
    lowercase ``ı`` maps to plain ``I``; every other input is delegated to
    ``str.upper``.
    """
    turkish_overrides = {"i": "İ", "ı": "I"}
    mapped = turkish_overrides.get(ch)
    return mapped if mapped is not None else ch.upper()
def normalize_delim(delim: str) -> str:
    """Translate the user-supplied delimiter into the string to split on.

    ``None``, the empty string and the two-character escape ``\\t`` all
    resolve to a real TAB character; any other value is returned untouched.
    """
    tab_aliases = (None, "", r"\t")
    return "\t" if delim in tab_aliases else delim
+# ---------- Core processors ----------
def process_paragraph_simple(text: str, delim: str) -> Tuple[str, Dict[str, Any]]:
    """Clean up the part of *text* that follows the first *delim*.

    The text is split at the first occurrence of *delim*. Leading whitespace
    is stripped from the right-hand part and, if its first character is
    lowercase, that character is upper-cased with Turkish-aware rules.

    Args:
        text: Plain paragraph text.
        delim: Separator between the left part and the right part. An empty
            delimiter cannot split anything, so the text is returned as-is.

    Returns:
        ``(new_text, meta)`` where ``meta`` has the keys ``changed``,
        ``left``, ``right_before`` and ``right_after``. When the delimiter is
        empty or absent, ``changed`` is ``False`` and the other three values
        are ``None``.
    """
    # `"" in text` is always True and `text.split("", 1)` raises ValueError,
    # so an empty delimiter must be rejected before attempting the split.
    if not delim or delim not in text:
        return text, {"changed": False, "left": None, "right_before": None, "right_after": None}

    # partition() splits at the first occurrence only, like split(delim, 1).
    left, _, right = text.partition(delim)

    cleaned = right.lstrip()
    if cleaned and cleaned[0].islower():
        cleaned = tr_upper_initial(cleaned[0]) + cleaned[1:]

    new_text = f"{left}{delim}{cleaned}"
    return new_text, {
        "changed": new_text != text,
        "left": left,
        "right_before": right,
        "right_after": cleaned,
    }
def process_document(
    in_path: str,
    out_path: str,
    delim: str = "\t",
    preserve_runs: bool = False  # kept for future extensibility; current mode is simple
) -> List[Dict[str, Any]]:
    """Apply the paragraph clean-up to one .docx file and write the result.

    The document at *in_path* is loaded, each entry of ``doc.paragraphs`` is
    passed through ``process_paragraph_simple``, and paragraphs whose text
    changed are overwritten via ``paragraph.text``. The document is then saved
    to *out_path*, whether or not anything changed.

    Args:
        in_path: Source .docx to read.
        out_path: Destination path for the processed document.
        delim: Delimiter forwarded to ``process_paragraph_simple``.
        preserve_runs: Currently unused; accepted for forward compatibility.

    Returns:
        One record per modified paragraph, with the keys ``file``,
        ``paragraph_index``, ``before``, ``after``, ``left_side``,
        ``right_before`` and ``right_after``.
    """
    document = Document(in_path)
    records: List[Dict[str, Any]] = []

    for position, paragraph in enumerate(document.paragraphs):
        before = paragraph.text
        after, info = process_paragraph_simple(before, delim)
        if not info["changed"]:
            continue

        # Assigning .text rewrites the paragraph content as plain text.
        paragraph.text = after
        records.append(
            dict(
                file=os.path.basename(in_path),
                paragraph_index=position,
                before=before,
                after=after,
                left_side=info["left"],
                right_before=info["right_before"],
                right_after=info["right_after"],
            )
        )

    document.save(out_path)
    return records
+# ---------- Gradio callable ----------
def run_job(
    files: List[str],
    search_query: str,
    delimiter_input: str,
) -> Tuple[str, pd.DataFrame]:
    """Process uploaded .docx files and build the ZIP plus the preview table.

    Args:
        files: Paths of the uploaded files; entries that do not end in
            ``.docx`` (case-insensitive) are skipped.
        search_query: Optional literal substring (case-insensitive). When
            given, only change records whose before/after text contains it
            are kept in the preview. It does not affect which files are
            processed or zipped.
        delimiter_input: ``"\\t"`` for TAB or a literal delimiter string;
            passed through ``normalize_delim``.

    Returns:
        ``(zip_path, preview)`` where ``zip_path`` is the path of a ZIP
        holding every processed document and ``preview`` is a DataFrame with
        the columns ``file``, ``paragraph_index``, ``before`` and ``after``.
        With no input files, ``zip_path`` is ``""`` and the table is empty.
    """
    preview_columns = ["file", "paragraph_index", "before", "after"]
    if not files:
        return "", pd.DataFrame(columns=preview_columns)

    delim = normalize_delim(delimiter_input)
    workdir = tempfile.mkdtemp(prefix="docx_batch_")
    outdir = os.path.join(workdir, "out")
    os.makedirs(outdir, exist_ok=True)

    all_changes = []
    try:
        for fpath in files:
            if not fpath.lower().endswith(".docx"):
                continue
            root, _ = os.path.splitext(os.path.basename(fpath))
            out_path = os.path.join(outdir, f"{root}_Capitalized_Strip.docx")
            all_changes.extend(process_document(fpath, out_path, delim=delim))

        # The ZIP sits next to (not inside) outdir so it survives the cleanup.
        zip_path = os.path.join(workdir, "Processed_Docx.zip")
        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
            for name in os.listdir(outdir):
                zf.write(os.path.join(outdir, name), arcname=name)
    finally:
        # The loose processed copies are only needed to build the ZIP;
        # remove them so each run does not leave duplicate files behind.
        shutil.rmtree(outdir, ignore_errors=True)

    # Build preview table
    df = pd.DataFrame(all_changes, columns=[
        "file", "paragraph_index", "before", "after", "left_side", "right_before", "right_after"
    ])

    # Apply search filter if provided (search right_before/right_after plus full before/after)
    if search_query and not df.empty:
        needle = search_query.lower()
        mask = pd.Series(False, index=df.index)
        for column in ("before", "after", "right_before", "right_after"):
            haystack = df[column].fillna("").str.lower()
            # regex=False: the query is a plain substring, so characters such
            # as "(", "?" or "." must not be interpreted as a regex pattern.
            mask |= haystack.str.contains(needle, regex=False)
        df = df[mask].reset_index(drop=True)

    return zip_path, df[preview_columns]
+# ---------- UI ----------
# Layout, top to bottom: intro text, an input row (upload / delimiter /
# search), the "Process" button, the ZIP download slot, and the preview table.
with gr.Blocks(title="DOCX Dialogue Capitalizer (TR-aware)") as demo:
    gr.Markdown(
        "\n".join(
            [
                "### DOCX Dialogue Capitalizer",
                "- Split at the **first delimiter** (default: TAB), strip leading spaces, then **capitalize the first letter**.",
                "- Designed for Turkish (`i→İ`, `ı→I`).",
                "- Upload multiple `.docx`, optionally **search** results, and **download ZIP**.",
            ]
        )
    )

    with gr.Row():
        # type="filepath" matches run_job, which treats each entry as a path string.
        file_in = gr.File(
            type="filepath",
            file_types=[".docx"],
            file_count="multiple",
            label="Upload .docx files",
        )
        delimiter = gr.Textbox(
            value="\\t",
            label="Delimiter",
            info="Use \\t for TAB, or any literal (e.g., '—' or ':').",
        )
        search = gr.Textbox(
            placeholder="Substring to filter changed lines…",
            label="Search (optional)",
        )

    run_btn = gr.Button("Process")

    with gr.Row():
        zip_out = gr.File(label="Download ZIP (processed files)")

    # NOTE(review): the `height` keyword may be version-dependent in Gradio's
    # Dataframe API — confirm against the Gradio version this app runs on.
    df_out = gr.Dataframe(
        height=400,
        wrap=True,
        interactive=False,
        label="Preview of Changes / Search Matches",
    )

    # Input order must match run_job's signature: (files, search_query, delimiter_input).
    run_btn.click(
        fn=run_job,
        inputs=[file_in, search, delimiter],
        outputs=[zip_out, df_out],
    )

if __name__ == "__main__":
    # share=False keeps the app local to this host/port; switch to
    # share=True (e.g. in Colab) to get a public URL.
    demo.launch(share=False, server_port=7860, server_name="0.0.0.0")