Spaces:
Sleeping
Sleeping
updates
Browse files- .gitignore +4 -0
- app-ref.py +0 -358
.gitignore
CHANGED
|
@@ -4,3 +4,7 @@ uv.lock
|
|
| 4 |
.python-version
|
| 5 |
__pycache__
|
| 6 |
*.log
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
.python-version
|
| 5 |
__pycache__
|
| 6 |
*.log
|
| 7 |
+
.vscode
|
| 8 |
+
.cursor
|
| 9 |
+
.idml
|
| 10 |
+
.csv
|
app-ref.py
DELETED
|
@@ -1,358 +0,0 @@
|
|
| 1 |
-
from dotenv import load_dotenv
|
| 2 |
-
import os
|
| 3 |
-
from datetime import datetime
|
| 4 |
-
from simple_idml import idml
|
| 5 |
-
import re
|
| 6 |
-
import shutil
|
| 7 |
-
import tempfile
|
| 8 |
-
import json
|
| 9 |
-
import gradio as gr
|
| 10 |
-
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 11 |
-
from langchain_core.output_parsers import JsonOutputParser
|
| 12 |
-
from langchain_core.prompts import PromptTemplate
|
| 13 |
-
import docx2txt
|
| 14 |
-
|
| 15 |
-
# imports
|
| 16 |
-
|
| 17 |
-
load_dotenv()
|
| 18 |
-
os.getenv("GOOGLE_API_KEY")
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
def read_docx(path):
    """Return the plain-text contents of the .docx file at *path*."""
    return docx2txt.process(path)
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
def process_transcription(file, marketing_prompt, metrics_prompt):
    """Run both LLM passes over an uploaded transcription.

    Args:
        file: Path (or file-like object) of the .docx transcription.
        marketing_prompt: Guidelines injected into the marketing-copy prompt.
        metrics_prompt: Extra instructions for the metrics-extraction prompt.

    Returns:
        dict: The JSON metrics parsed from the model response, with the
        generated marketing copy added under the "description" key.
    """
    transcription_text = read_docx(file)

    # One shared Gemini client serves both chains.
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )
    json_parser = JsonOutputParser()

    # Chain 1: free-form marketing copy (raw message content, no JSON parsing).
    copy_template = PromptTemplate(
        template="""Transform the following transcription into engaging marketing copy.
Follow these guidelines only and only give the body text: {marketing_prompt}

Transcription:
{transcription_text}

Generate marketing copy that is compelling and aligned with the provided guidelines.
Focus on key benefits, unique selling points, and engaging narrative.""",
        input_variables=["transcription_text", "marketing_prompt"],
    )
    marketing_result = (copy_template | llm).invoke(
        {"transcription_text": transcription_text, "marketing_prompt": marketing_prompt}
    )

    # Chain 2: structured project metrics, parsed straight into a dict.
    metrics_template = PromptTemplate(
        template="""Extract project metrics and statistics from the following transcription.
Focus on these aspects: metrics should only and only be the in proper format avoid adding any description or other things if there is nothing can be found do not put in the output
for consultants and and project team only and only output a result if there is a first name and last or a entity name can be found. avoid general name such as structural consultants, lighting consultant etc. outputs should be a list of strings for the names
only use these keys when possible and relevant location, project_name, size, height, number_of_floors, completion_date, client_name, project_team_members, external_consultants
{metrics_prompt}


Transcription:
{transcription_text}


Generate a JSON object containing the extracted metrics and statistics.
Be specific and quantitative where possible.""",
        input_variables=["transcription_text", "metrics_prompt"],
    )
    metrics_result = (metrics_template | llm | json_parser).invoke(
        {"transcription_text": transcription_text, "metrics_prompt": metrics_prompt}
    )

    # Fold the marketing copy into the metrics payload for downstream use.
    # NOTE(review): assumes the parsed JSON is a dict — confirm the model
    # never returns a top-level list.
    metrics_result["description"] = marketing_result.content
    return metrics_result
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
def find_story_files(idml_package, tag_patterns):
    """
    Find story files containing specific tags

    Args:
        idml_package: The IDML package (a ZipFile-like object exposing
            ``namelist()`` and ``open()``)
        tag_patterns: List of regex tag patterns to search for

    Returns:
        dict: Mapping of each tag pattern to the list of ``Stories/...``
        entries whose XML matches it (empty list when nothing matches)
    """
    # Compile each pattern once instead of re-compiling per story file.
    compiled_patterns = {pattern: re.compile(pattern) for pattern in tag_patterns}

    tag_to_story = {pattern: [] for pattern in tag_patterns}

    stories = [name for name in idml_package.namelist() if name.startswith("Stories/")]

    for story_path in stories:
        try:
            # Context manager closes the archive member handle even when
            # read/decode fails (the original leaked the open handle).
            with idml_package.open(story_path) as member:
                content = member.read().decode("utf-8")
            for pattern, regex in compiled_patterns.items():
                if regex.search(content):
                    tag_to_story[pattern].append(story_path)
        except Exception as e:
            # Best-effort: skip unreadable stories but keep scanning the rest.
            print(f"Error reading {story_path}: {e}")

    return tag_to_story
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
def replace_content(xml_content, tag_pattern, replacements):
    """
    Replace content tags with actual data

    Each match of ``tag_pattern`` is rewritten, in document order, to
    ``<Content>value</Content>`` using the corresponding entry of
    ``replacements``.  Surplus matches (more tags than values) are removed,
    together with a following ``<Br />`` when one appears within the next
    20 characters.  Surplus values (more values than tags) are appended
    right after the last match, each as ``<Content>...</Content>`` followed
    by ``<Br />``.

    Args:
        xml_content: The XML content to modify
        tag_pattern: The regex pattern to match tags
        replacements: List of replacement values

    Returns:
        str: Updated XML content (unchanged when the pattern never matches)
    """
    matches = list(re.finditer(tag_pattern, xml_content))

    if not matches:
        return xml_content

    pieces = []
    cursor = 0  # position in xml_content up to which output was emitted

    for index, match in enumerate(matches):
        pieces.append(xml_content[cursor:match.start()])
        cursor = match.end()

        if index < len(replacements):
            # Replace with actual data
            pieces.append(f"<Content>{replacements[index]}</Content>")
        else:
            # Surplus tag: drop it, and also swallow an adjacent <Br />
            # found within the next 20 characters.
            window = xml_content[cursor:cursor + 20]
            br_match = re.search(r"\s*<Br />", window)
            if br_match:
                cursor += br_match.end()

    # Surplus values: append them immediately after the final tag.
    # (The original inserted these at a stale pre-edit list offset and
    # advanced by character length per single-element insert, which
    # misplaced every extra entry after the first.)
    for item in replacements[len(matches):]:
        pieces.append(f"\n<Content>{item}</Content>\n<Br />")

    pieces.append(xml_content[cursor:])
    return "".join(pieces)
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
def update_idml_content(idml_path, replacements_json):
    """
    Update IDML content with replacements from JSON

    Opens a scratch copy of the template, locates the story files that
    contain each tag pattern, rewrites them via replace_content(), and
    re-zips the result next to this script.

    Args:
        idml_path: Path to the IDML file
        replacements_json: JSON string or dict with tag patterns and replacements

    Returns:
        str: Path to the updated IDML file (written alongside this script,
        so it survives the temporary directory's cleanup)
    """
    # Parse JSON if it's a string
    if isinstance(replacements_json, str):
        replacements = json.loads(replacements_json)
    else:
        replacements = replacements_json

    # Get the directory where app.py is located
    app_dir = os.path.dirname(os.path.abspath(__file__))

    # Create a temporary directory (auto-removed when the block exits)
    with tempfile.TemporaryDirectory() as temp_dir:
        # Create a copy of the IDML file to work with, so the uploaded
        # template is never modified in place
        temp_idml = os.path.join(temp_dir, "temp.idml")
        shutil.copy2(idml_path, temp_idml)

        with idml.IDMLPackage(temp_idml) as working_idml:
            # Find all story files containing our tags
            tag_patterns = list(replacements.keys())
            tag_to_story = find_story_files(working_idml, tag_patterns)

            # Extract the IDML (an IDML file is a zip archive)
            extract_dir = os.path.join(temp_dir, "extracted")
            os.makedirs(extract_dir, exist_ok=True)
            working_idml.extractall(extract_dir)

            # Process each tag pattern
            for tag_pattern, replacement_values in replacements.items():
                story_files = tag_to_story.get(tag_pattern, [])

                if not story_files:
                    # Missing tags are tolerated: warn and move on
                    print(
                        f"Warning: No story files found containing pattern '{tag_pattern}'"
                    )
                    continue

                print(
                    f"Found pattern '{tag_pattern}' in {len(story_files)} story file(s)"
                )

                # Update each story file containing this tag
                for story_path in story_files:
                    # Read the XML content
                    with open(
                        os.path.join(extract_dir, story_path), "r", encoding="utf-8"
                    ) as f:
                        xml_content = f.read()

                    # Update the content
                    updated_content = replace_content(
                        xml_content, tag_pattern, replacement_values
                    )

                    # Write back the updated content
                    with open(
                        os.path.join(extract_dir, story_path), "w", encoding="utf-8"
                    ) as f:
                        f.write(updated_content)

            # Create the output path in the same directory as app.py;
            # timestamp suffix avoids clobbering previous runs
            base_name = os.path.splitext(os.path.basename(idml_path))[0]
            output_filename = (
                f"{base_name}_filled_{datetime.now().strftime('%Y%m%d%H%M%S')}.idml"
            )
            output_path = os.path.join(app_dir, output_filename)

            # Create a new IDML with the updated content.
            # make_archive always appends ".zip", so build "<name>.idml.zip"
            # and rename it back to the plain ".idml" path.
            shutil.make_archive(output_path, "zip", extract_dir)
            os.rename(output_path + ".zip", output_path)

            print(f"Updated IDML saved to: {output_path}")
            return output_path
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
def create_replacements_from_metrics(metrics_data):
    """
    Convert metrics data to the replacements dictionary format

    Args:
        metrics_data: Dictionary containing project metrics

    Returns:
        dict: Mapping of IDML tag patterns (regex strings) to the list of
        replacement values for each pattern; missing keys yield empty values
    """
    team = metrics_data.get("project_team_members", [])
    consultants = metrics_data.get("external_consultants", [])

    replacements = {}
    # Single-value placeholders, in template order.
    replacements[r"<Content><Description></Content>"] = [
        metrics_data.get("description", "")
    ]
    replacements[r"<Content><Project Name></Content>"] = [
        metrics_data.get("project_name", "")
    ]
    replacements[r"<Content><Location></Content>"] = [metrics_data.get("location", "")]
    replacements[r"<Content><Area> SF</Content>"] = [metrics_data.get("size", "")]
    replacements[r"<Content><NumFloors></Content>"] = [
        metrics_data.get("number_of_floors", "")
    ]
    # Completion date is stringified via an f-string, matching the template slot.
    replacements[r"<Content><DateComplete> \(<Phase>\)</Content>"] = [
        f"{metrics_data.get('completion_date', '')}"
    ]
    replacements[r"<Content><Client></Content>"] = [metrics_data.get("client_name", "")]
    # Repeated placeholders: one entry per team member / consultant.
    # Team entries keep the trailing space the template expects before "(<Role>)".
    replacements[r"<Content><TEAM\d+> \(<Role\d+>\)</Content>"] = [
        f"{member} " for member in team
    ]
    replacements[r"<Content><Consultant\d+></Content>"] = list(consultants)

    return replacements
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
def process_and_update_idml(file, marketing_prompt, metrics_prompt, idml_path):
    """
    End-to-end handler for the Gradio form.

    Runs the LLM passes over the uploaded transcription, converts the
    extracted metrics into IDML tag replacements, and writes a filled copy
    of the uploaded IDML template.

    Args:
        file: Uploaded transcription (Gradio file object exposing ``.name``).
        marketing_prompt: Guidelines for the marketing-copy generation.
        metrics_prompt: Extra instructions for the metrics extraction.
        idml_path: Uploaded IDML template (Gradio file object).

    Returns:
        tuple: (status message, JSON string of results, output path) on
        success; (error message, JSON error string, None) on any failure.
    """
    try:
        # Gradio temp uploads can lack an extension; copy to a suffixed
        # temp file so downstream readers can detect the format.
        if not os.path.splitext(file.name)[1]:
            temp_file = tempfile.NamedTemporaryFile(suffix=".docx", delete=False)
            # Only the path is needed — close now so the fd doesn't leak
            # (the original kept every handle open for the process lifetime).
            temp_file.close()
            shutil.copy2(file.name, temp_file.name)
            file = temp_file.name

        # Process the transcription (marketing copy + metrics).
        # NOTE(review): when the upload *does* have an extension, the raw
        # Gradio object is passed through — presumably accepted downstream;
        # confirm against the Gradio version in use.
        results = process_transcription(file, marketing_prompt, metrics_prompt)

        # Same extension workaround for the IDML template.
        if not os.path.splitext(idml_path.name)[1]:
            temp_idml = tempfile.NamedTemporaryFile(suffix=".idml", delete=False)
            temp_idml.close()
            shutil.copy2(idml_path.name, temp_idml.name)
            idml_path = temp_idml.name

        # Prepare the replacements dictionary
        replacements = create_replacements_from_metrics(results)

        # Update the IDML file
        output_path = update_idml_content(idml_path, replacements)

        return (
            f"Successfully processed and updated IDML. Output saved to: {output_path}",
            json.dumps(results, indent=2),
            output_path,
        )
    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        error_json = {
            "error": str(e),
            "description": "An error occurred during processing",
        }
        return f"Error: {str(e)}", json.dumps(error_json, indent=2), None
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
# Create Gradio interface
|
| 332 |
-
# Gradio front-end: wire the processing pipeline to a simple form UI.
transcription_input = gr.File(
    label="Upload Transcription File (DOCX)", file_types=[".docx"]
)
marketing_prompt_input = gr.Textbox(
    label="Marketing Prompt",
    value="create short paragraph with friendly tone focusing on the sustainability aspects of the project",
    lines=3,
)
metrics_prompt_input = gr.Textbox(
    label="Metrics Prompt",
    value="extract project name, location, Size in square feet, number of floors, total height, completion date, client name, project team members name and any external consultants",
    lines=3,
)
template_input = gr.File(label="Upload indesign template (idml)", file_types=[".idml"])

iface = gr.Interface(
    fn=process_and_update_idml,
    inputs=[
        transcription_input,
        marketing_prompt_input,
        metrics_prompt_input,
        template_input,
    ],
    outputs=[
        gr.Textbox(label="IDML Update Status", lines=2),
        gr.JSON(label="Transcription Results"),
        gr.File(label="Download Updated IDML"),
    ],
    title="Marketing Transcription Processor",
    description="Upload a transcription file and IDML template to generate marketing content and update the IDML file.",
)


iface.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|