ashkoff committed on
Commit
aea562e
·
1 Parent(s): e910624
Files changed (2) hide show
  1. .gitignore +4 -0
  2. app-ref.py +0 -358
.gitignore CHANGED
@@ -4,3 +4,7 @@ uv.lock
4
  .python-version
5
  __pycache__
6
  *.log
 
 
 
 
 
4
  .python-version
5
  __pycache__
6
  *.log
7
+ .vscode
8
+ .cursor
9
+ .idml
10
+ .csv
app-ref.py DELETED
@@ -1,358 +0,0 @@
1
- from dotenv import load_dotenv
2
- import os
3
- from datetime import datetime
4
- from simple_idml import idml
5
- import re
6
- import shutil
7
- import tempfile
8
- import json
9
- import gradio as gr
10
- from langchain_google_genai import ChatGoogleGenerativeAI
11
- from langchain_core.output_parsers import JsonOutputParser
12
- from langchain_core.prompts import PromptTemplate
13
- import docx2txt
14
-
15
- # imports
16
-
17
- load_dotenv()
18
- os.getenv("GOOGLE_API_KEY")
19
-
20
-
21
- def read_docx(path):
22
-
23
- text = docx2txt.process(path)
24
- return text
25
-
26
-
27
- def process_transcription(file, marketing_prompt, metrics_prompt):
28
- # Read the uploaded file
29
- transcription_text = read_docx(file)
30
- # Initialize parser and model
31
- parser = JsonOutputParser()
32
- model = ChatGoogleGenerativeAI(
33
- model="gemini-2.0-flash",
34
- temperature=0,
35
- max_tokens=None,
36
- timeout=None,
37
- max_retries=2,
38
- )
39
-
40
- # Process marketing copy
41
- marketing_prompt_template = PromptTemplate(
42
- template="""Transform the following transcription into engaging marketing copy.
43
- Follow these guidelines only and only give the body text: {marketing_prompt}
44
-
45
- Transcription:
46
- {transcription_text}
47
-
48
- Generate marketing copy that is compelling and aligned with the provided guidelines.
49
- Focus on key benefits, unique selling points, and engaging narrative.""",
50
- input_variables=["transcription_text", "marketing_prompt"],
51
- )
52
-
53
- marketing_chain = marketing_prompt_template | model
54
- marketing_result = marketing_chain.invoke(
55
- {"transcription_text": transcription_text, "marketing_prompt": marketing_prompt}
56
- )
57
-
58
- # Process project metrics
59
- metrics_prompt_template = PromptTemplate(
60
- template="""Extract project metrics and statistics from the following transcription.
61
- Focus on these aspects: metrics should only and only be the in proper format avoid adding any description or other things if there is nothing can be found do not put in the output
62
- for consultants and and project team only and only output a result if there is a first name and last or a entity name can be found. avoid general name such as structural consultants, lighting consultant etc. outputs should be a list of strings for the names
63
- only use these keys when possible and relevant location, project_name, size, height, number_of_floors, completion_date, client_name, project_team_members, external_consultants
64
- {metrics_prompt}
65
-
66
-
67
- Transcription:
68
- {transcription_text}
69
-
70
-
71
- Generate a JSON object containing the extracted metrics and statistics.
72
- Be specific and quantitative where possible.""",
73
- input_variables=["transcription_text", "metrics_prompt"],
74
- )
75
-
76
- metrics_chain = metrics_prompt_template | model | parser
77
- metrics_result = metrics_chain.invoke(
78
- {"transcription_text": transcription_text, "metrics_prompt": metrics_prompt}
79
- )
80
-
81
- # Format metrics result for display
82
-
83
- metrics_result["description"] = marketing_result.content
84
- # formatted_metrics = json.dumps(metrics_result, indent=2)
85
-
86
- return metrics_result
87
-
88
-
89
- def find_story_files(idml_package, tag_patterns):
90
- """
91
- Find story files containing specific tags
92
-
93
- Args:
94
- idml_package: The IDML package
95
- tag_patterns: List of tag patterns to search for
96
-
97
- Returns:
98
- dict: Mapping of tag patterns to story files
99
- """
100
- compiled_patterns = {pattern: re.compile(pattern) for pattern in tag_patterns}
101
-
102
- tag_to_story = {pattern: [] for pattern in tag_patterns}
103
-
104
- stories = [name for name in idml_package.namelist() if name.startswith("Stories/")]
105
-
106
- for story_path in stories:
107
- try:
108
- content = idml_package.open(story_path).read().decode("utf-8")
109
- for pattern, regex in compiled_patterns.items():
110
- if regex.search(content):
111
- tag_to_story[pattern].append(story_path)
112
- except Exception as e:
113
- print(f"Error reading {story_path}: {e}")
114
-
115
- return tag_to_story
116
-
117
-
118
- def replace_content(xml_content, tag_pattern, replacements):
119
- """
120
- Replace content tags with actual data
121
-
122
- Args:
123
- xml_content: The XML content to modify
124
- tag_pattern: The regex pattern to match tags
125
- replacements: List of replacement values
126
-
127
- Returns:
128
- str: Updated XML content
129
- """
130
- tags = re.finditer(tag_pattern, xml_content)
131
- tag_positions = [(m.start(), m.end()) for m in tags]
132
-
133
- if not tag_positions:
134
- return xml_content
135
-
136
- content_chars = list(xml_content)
137
-
138
- for i, (start, end) in enumerate(reversed(tag_positions)):
139
- index = len(tag_positions) - 1 - i # Reverse index
140
-
141
- if index < len(replacements):
142
- # Replace with actual data
143
- new_content = f"<Content>{replacements[index]}</Content>"
144
- content_chars[start:end] = new_content
145
- else:
146
- br_pattern = r"\s*<Br />"
147
- br_match = re.search(br_pattern, "".join(content_chars[end : end + 20]))
148
- if br_match:
149
- del content_chars[start : end + br_match.end()]
150
- else:
151
- del content_chars[start:end]
152
-
153
- if len(replacements) > len(tag_positions) and tag_positions:
154
- last_pos = tag_positions[-1][1]
155
-
156
- for item in replacements[len(tag_positions) :]:
157
- insert_content = f"\n<Content>{item}</Content>\n<Br />"
158
- content_chars.insert(last_pos, insert_content)
159
- last_pos += len(insert_content)
160
-
161
- return "".join(content_chars)
162
-
163
-
164
- def update_idml_content(idml_path, replacements_json):
165
- """
166
- Update IDML content with replacements from JSON
167
-
168
- Args:
169
- idml_path: Path to the IDML file
170
- replacements_json: JSON string or dict with tag patterns and replacements
171
-
172
- Returns:
173
- str: Path to the updated IDML file
174
- """
175
- # Parse JSON if it's a string
176
- if isinstance(replacements_json, str):
177
- replacements = json.loads(replacements_json)
178
- else:
179
- replacements = replacements_json
180
-
181
- # Get the directory where app.py is located
182
- app_dir = os.path.dirname(os.path.abspath(__file__))
183
-
184
- # Create a temporary directory
185
- with tempfile.TemporaryDirectory() as temp_dir:
186
- # Create a copy of the IDML file to work with
187
- temp_idml = os.path.join(temp_dir, "temp.idml")
188
- shutil.copy2(idml_path, temp_idml)
189
-
190
- with idml.IDMLPackage(temp_idml) as working_idml:
191
- # Find all story files containing our tags
192
- tag_patterns = list(replacements.keys())
193
- tag_to_story = find_story_files(working_idml, tag_patterns)
194
-
195
- # Extract the IDML
196
- extract_dir = os.path.join(temp_dir, "extracted")
197
- os.makedirs(extract_dir, exist_ok=True)
198
- working_idml.extractall(extract_dir)
199
-
200
- # Process each tag pattern
201
- for tag_pattern, replacement_values in replacements.items():
202
- story_files = tag_to_story.get(tag_pattern, [])
203
-
204
- if not story_files:
205
- print(
206
- f"Warning: No story files found containing pattern '{tag_pattern}'"
207
- )
208
- continue
209
-
210
- print(
211
- f"Found pattern '{tag_pattern}' in {len(story_files)} story file(s)"
212
- )
213
-
214
- # Update each story file containing this tag
215
- for story_path in story_files:
216
- # Read the XML content
217
- with open(
218
- os.path.join(extract_dir, story_path), "r", encoding="utf-8"
219
- ) as f:
220
- xml_content = f.read()
221
-
222
- # Update the content
223
- updated_content = replace_content(
224
- xml_content, tag_pattern, replacement_values
225
- )
226
-
227
- # Write back the updated content
228
- with open(
229
- os.path.join(extract_dir, story_path), "w", encoding="utf-8"
230
- ) as f:
231
- f.write(updated_content)
232
-
233
- # Create the output path in the same directory as app.py
234
- base_name = os.path.splitext(os.path.basename(idml_path))[0]
235
- output_filename = (
236
- f"{base_name}_filled_{datetime.now().strftime('%Y%m%d%H%M%S')}.idml"
237
- )
238
- output_path = os.path.join(app_dir, output_filename)
239
-
240
- # Create a new IDML with the updated content
241
- shutil.make_archive(output_path, "zip", extract_dir)
242
- os.rename(output_path + ".zip", output_path)
243
-
244
- print(f"Updated IDML saved to: {output_path}")
245
- return output_path
246
-
247
-
248
- def create_replacements_from_metrics(metrics_data):
249
- """
250
- Convert metrics data to the replacements dictionary format
251
-
252
- Args:
253
- metrics_data: Dictionary containing project metrics
254
-
255
- Returns:
256
- dict: Mapping of tag patterns to replacement values
257
- """
258
- # Define mappings between metrics keys and IDML tag patterns
259
- replacements = {
260
- # Project Description
261
- r"<Content>&lt;Description&gt;</Content>": [
262
- metrics_data.get("description", "")
263
- ],
264
- # Project name
265
- r"<Content>&lt;Project Name&gt;</Content>": [
266
- metrics_data.get("project_name", "")
267
- ],
268
- # Location
269
- r"<Content>&lt;Location&gt;</Content>": [metrics_data.get("location", "")],
270
- # Size/Area
271
- r"<Content>&lt;Area&gt; SF</Content>": [metrics_data.get("size", "")],
272
- # Number of floors
273
- r"<Content>&lt;NumFloors&gt;</Content>": [
274
- metrics_data.get("number_of_floors", "")
275
- ],
276
- # Completion date
277
- r"<Content>&lt;DateComplete&gt; \(&lt;Phase&gt;\)</Content>": [
278
- f"{metrics_data.get('completion_date', '')}"
279
- ],
280
- # Client
281
- r"<Content>&lt;Client&gt;</Content>": [metrics_data.get("client_name", "")],
282
- # Team members - format each with a placeholder role
283
- r"<Content>&lt;TEAM\d+&gt; \(&lt;Role\d+&gt;\)</Content>": [
284
- f"{member} " for member in metrics_data.get("project_team_members", [])
285
- ],
286
- # Consultants
287
- r"<Content>&lt;Consultant\d+&gt;</Content>": [
288
- consultant for consultant in metrics_data.get("external_consultants", [])
289
- ],
290
- }
291
-
292
- return replacements
293
-
294
-
295
- def process_and_update_idml(file, marketing_prompt, metrics_prompt, idml_path):
296
- try:
297
- # Ensure file has an extension
298
- if not os.path.splitext(file.name)[1]:
299
- temp_file = tempfile.NamedTemporaryFile(suffix=".docx", delete=False)
300
- shutil.copy2(file.name, temp_file.name)
301
- file = temp_file.name
302
-
303
- # Process the transcription
304
- results = process_transcription(file, marketing_prompt, metrics_prompt)
305
-
306
- # Ensure IDML file has extension
307
- if not os.path.splitext(idml_path.name)[1]:
308
- temp_idml = tempfile.NamedTemporaryFile(suffix=".idml", delete=False)
309
- shutil.copy2(idml_path.name, temp_idml.name)
310
- idml_path = temp_idml.name
311
-
312
- # Prepare the replacements dictionary
313
- replacements = create_replacements_from_metrics(results)
314
-
315
- # Update the IDML file
316
- output_path = update_idml_content(idml_path, replacements)
317
-
318
- return (
319
- f"Successfully processed and updated IDML. Output saved to: {output_path}",
320
- json.dumps(results, indent=2),
321
- output_path,
322
- )
323
- except Exception as e:
324
- error_json = {
325
- "error": str(e),
326
- "description": "An error occurred during processing",
327
- }
328
- return f"Error: {str(e)}", json.dumps(error_json, indent=2), None
329
-
330
-
331
- # Create Gradio interface
332
- iface = gr.Interface(
333
- fn=process_and_update_idml,
334
- inputs=[
335
- gr.File(label="Upload Transcription File (DOCX)", file_types=[".docx"]),
336
- gr.Textbox(
337
- label="Marketing Prompt",
338
- value="create short paragraph with friendly tone focusing on the sustainability aspects of the project",
339
- lines=3,
340
- ),
341
- gr.Textbox(
342
- label="Metrics Prompt",
343
- value="extract project name, location, Size in square feet, number of floors, total height, completion date, client name, project team members name and any external consultants",
344
- lines=3,
345
- ),
346
- gr.File(label="Upload indesign template (idml)", file_types=[".idml"]),
347
- ],
348
- outputs=[
349
- gr.Textbox(label="IDML Update Status", lines=2),
350
- gr.JSON(label="Transcription Results"),
351
- gr.File(label="Download Updated IDML"),
352
- ],
353
- title="Marketing Transcription Processor",
354
- description="Upload a transcription file and IDML template to generate marketing content and update the IDML file.",
355
- )
356
-
357
-
358
- iface.launch()