armyneo committed on
Commit
303dc05
·
verified ·
1 Parent(s): 460d131
Files changed (1) hide show
  1. app.py +186 -0
app.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ # Gradio web app for batch .docx processing:
3
+ # - After the first TAB in each paragraph, strip leading spaces and capitalize first letter (TR-aware).
4
+ # - Search dialogues (optional) and preview changes.
5
+ # - Download ZIP of processed files.
6
+
7
import os
import io
import shutil
import tempfile
import zipfile
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
from docx import Document
import gradio as gr
17
+
18
+ # ---------- Text helpers ----------
19
+
20
def tr_upper_initial(ch: str) -> str:
    """Uppercase one leading character using Turkish dotted/dotless-i rules.

    "i" maps to "İ" and "ı" maps to "I"; every other character is passed
    through ``str.upper()``.
    """
    turkish_overrides = {"i": "İ", "ı": "I"}
    if ch in turkish_overrides:
        return turkish_overrides[ch]
    return ch.upper()
27
+
28
def normalize_delim(delim: str) -> str:
    """Resolve the user-supplied delimiter to the string actually split on.

    A missing value (None or empty string) and the two-character escape
    '\\t' both resolve to a real TAB; any other value is returned untouched.
    """
    tab = "\t"
    if delim in (None, "", r"\t"):
        return tab
    return delim
35
+
36
+ # ---------- Core processors ----------
37
+
38
def process_paragraph_simple(text: str, delim: str) -> Tuple[str, Dict[str, Any]]:
    """
    Rewrite a paragraph's plain text around the first occurrence of delim.

    The part after the delimiter has its leading whitespace removed and, if it
    then starts with a lowercase character, that character is upper-cased via
    tr_upper_initial. This works on the flat paragraph text only (runs are merged).

    Returns (new_text, change_meta), where change_meta has the keys
    "changed", "left", "right_before" and "right_after". When delim does not
    occur, the text is returned as-is with "changed" False and the rest None.
    """
    head, found, tail = text.partition(delim)
    if not found:
        return text, {"changed": False, "left": None, "right_before": None, "right_after": None}

    trimmed = tail.lstrip()
    # Only touch the initial when there is one and it is lowercase.
    if trimmed and trimmed[0].islower():
        trimmed = tr_upper_initial(trimmed[0]) + trimmed[1:]

    rebuilt = head + delim + trimmed
    meta = {
        "changed": rebuilt != text,
        "left": head,
        "right_before": tail,
        "right_after": trimmed,
    }
    return rebuilt, meta
60
+
61
def process_document(
    in_path: str,
    out_path: str,
    delim: str = "\t",
    preserve_runs: bool = False  # kept for future extensibility; current mode is simple
) -> List[Dict[str, Any]]:
    """
    Load the .docx at in_path, rewrite its paragraphs, and save to out_path.

    Each entry of the document's ``paragraphs`` is passed through
    process_paragraph_simple; paragraphs reported as changed get their text
    replaced. ``preserve_runs`` is accepted but not used here.

    Returns one record per changed paragraph (file name, paragraph index,
    before/after text and the split parts) for the preview table.
    """
    document = Document(in_path)
    records = []

    for position, paragraph in enumerate(document.paragraphs):
        before = paragraph.text
        after, info = process_paragraph_simple(before, delim)
        if not info["changed"]:
            continue

        # Assigning to .text rewrites the whole paragraph from plain text.
        paragraph.text = after
        record = {
            "file": os.path.basename(in_path),
            "paragraph_index": position,
            "before": before,
            "after": after,
            "left_side": info["left"],
            "right_before": info["right_before"],
            "right_after": info["right_after"],
        }
        records.append(record)

    document.save(out_path)
    return records
91
+
92
+ # ---------- Gradio callable ----------
93
+
94
def run_job(
    files: List[str],
    search_query: str,
    delimiter_input: str,
) -> Tuple[Optional[str], pd.DataFrame]:
    """
    Gradio interface function.
    Inputs:
    - files: list of file paths; entries not ending in ".docx" (case-insensitive) are skipped
    - search_query: optional literal substring to filter dialogues (case-insensitive)
      on BEFORE or AFTER text; it is NOT interpreted as a regular expression
    - delimiter_input: "\\t" or a literal string to split dialogue
    Outputs:
    - path to ZIP of processed docs, or None when no files were supplied
    - DataFrame with change log (filtered by search if provided)
    """
    preview_columns = ["file", "paragraph_index", "before", "after"]
    if not files:
        # None (rather than "") is the "no file" value for a file output.
        return None, pd.DataFrame(columns=preview_columns)

    delim = normalize_delim(delimiter_input)

    workdir = tempfile.mkdtemp(prefix="docx_batch_")
    outdir = os.path.join(workdir, "out")
    os.makedirs(outdir, exist_ok=True)

    all_changes = []
    used_names = set()
    for fpath in files:
        if not fpath.lower().endswith(".docx"):
            continue
        root, _ = os.path.splitext(os.path.basename(fpath))
        out_name = f"{root}_Capitalized_Strip.docx"
        # Two uploads may share a basename; number the later ones so an
        # earlier result is never overwritten (compared case-insensitively).
        counter = 1
        while out_name.lower() in used_names:
            counter += 1
            out_name = f"{root}_Capitalized_Strip_{counter}.docx"
        used_names.add(out_name.lower())

        out_path = os.path.join(outdir, out_name)
        all_changes.extend(process_document(fpath, out_path, delim=delim))

    # Build preview table
    df = pd.DataFrame(all_changes, columns=[
        "file", "paragraph_index", "before", "after", "left_side", "right_before", "right_after"
    ])

    # Apply search filter if provided (search right_before/right_after plus full before/after)
    if search_query and not df.empty:
        q = search_query.lower()
        mask = pd.Series(False, index=df.index)
        for column in ("before", "after", "right_before", "right_after"):
            lowered = df[column].fillna("").str.lower()
            # regex=False: treat the query as plain text so characters such as
            # "(" or "." neither raise nor act as wildcards.
            mask |= lowered.str.contains(q, na=False, regex=False)
        df = df[mask].reset_index(drop=True)

    # Create ZIP
    zip_path = os.path.join(workdir, "Processed_Docx.zip")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for name in os.listdir(outdir):
            zf.write(os.path.join(outdir, name), arcname=name)

    # The individual documents now live inside the ZIP; drop the loose copies.
    shutil.rmtree(outdir, ignore_errors=True)

    return zip_path, df[preview_columns]
151
+
152
+ # ---------- UI ----------
153
+
154
# Top-level UI definition: builds the Blocks app bound to the name `demo`
# and wires the "Process" button to run_job.
with gr.Blocks(title="DOCX Dialogue Capitalizer (TR-aware)") as demo:
    gr.Markdown(
        "### DOCX Dialogue Capitalizer\n"
        "- Split at the **first delimiter** (default: TAB), strip leading spaces, then **capitalize the first letter**.\n"
        "- Designed for Turkish (`i→İ`, `ı→I`).\n"
        "- Upload multiple `.docx`, optionally **search** results, and **download ZIP**."
    )

    # Inputs: uploaded files, delimiter and optional search text.
    # NOTE(review): the nesting of `delimiter` and `search` under this Row is
    # reconstructed from a source with no indentation — confirm against the real file.
    with gr.Row():
        file_in = gr.File(
            label="Upload .docx files",
            file_count="multiple",
            file_types=[".docx"],
            type="filepath"
        )
        delimiter = gr.Textbox(label="Delimiter", value="\\t", info="Use \\t for TAB, or any literal (e.g., '—' or ':').")
        search = gr.Textbox(label="Search (optional)", placeholder="Substring to filter changed lines…")

    run_btn = gr.Button("Process")
    # Outputs: the ZIP download and the change-log table.
    with gr.Row():
        zip_out = gr.File(label="Download ZIP (processed files)")
        df_out = gr.Dataframe(
            label="Preview of Changes / Search Matches",
            interactive=False,
            wrap=True,
            height=400
        )

    # Input order follows run_job's parameters (files, search_query, delimiter_input);
    # output order follows its return tuple (zip path, DataFrame).
    run_btn.click(fn=run_job, inputs=[file_in, search, delimiter], outputs=[zip_out, df_out])

# Only start the server when this file is executed as a script.
if __name__ == "__main__":
    # For Colab: set share=True to get a public URL
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)