Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
| 1 |
-
"""NetCom → WooCommerce transformer (Try
|
| 2 |
-
|
| 3 |
*Accept CSV **or** Excel schedule files and output the WooCommerce CSV.*
|
| 4 |
|
| 5 |
-
Changes vs Try
|
| 6 |
-
|
| 7 |
-
*
|
| 8 |
-
*
|
| 9 |
-
|
| 10 |
-
*
|
| 11 |
"""
|
| 12 |
|
| 13 |
from __future__ import annotations
|
|
@@ -25,7 +25,7 @@ import gradio_client.utils
|
|
| 25 |
import openai
|
| 26 |
import pandas as pd
|
| 27 |
|
| 28 |
-
# -------- Gradio bool
|
| 29 |
_original = gradio_client.utils._json_schema_to_python_type
|
| 30 |
|
| 31 |
def _fixed_json_schema_to_python_type(schema, defs=None): # type: ignore
|
|
@@ -55,24 +55,42 @@ def _set_cache(p: str, r: str) -> None:
|
|
| 55 |
pass
|
| 56 |
|
| 57 |
# -------- Async GPT helpers --------------------------------------------------
|
|
|
|
|
|
|
|
|
|
| 58 |
async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
|
| 59 |
-
"""Single LLM call with on
|
| 60 |
cached = _get_cached(prompt)
|
| 61 |
if cached is not None:
|
| 62 |
return cached
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
try:
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
temperature=0,
|
| 69 |
-
)
|
| 70 |
-
text = msg.choices[0].message.content
|
| 71 |
-
except Exception as exc: # network or auth failure ‑ return explicit error string
|
| 72 |
-
text = f"Error: {exc}"
|
| 73 |
-
|
| 74 |
-
_set_cache(prompt, text)
|
| 75 |
-
return text
|
| 76 |
|
| 77 |
async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpenAI) -> list[str]:
|
| 78 |
"""Vectorised helper — returns an output list matching *lst* length."""
|
|
@@ -82,11 +100,10 @@ async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpe
|
|
| 82 |
if isinstance(txt, str) and txt.strip():
|
| 83 |
idx.append(i)
|
| 84 |
prompts.append(f"{instruction}\n\nText: {txt}")
|
| 85 |
-
#
|
| 86 |
-
if not prompts:
|
| 87 |
return out
|
| 88 |
|
| 89 |
-
#
|
| 90 |
responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
|
| 91 |
for j, val in enumerate(responses):
|
| 92 |
out[idx[j]] = val
|
|
@@ -107,7 +124,6 @@ def _read(path: str) -> pd.DataFrame:
|
|
| 107 |
async def _enrich_dataframe(df: pd.DataFrame, dcol: str, ocol: str, pcol: str, acol: str) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
|
| 108 |
"""Run all LLM batches concurrently and return the five enrichment columns."""
|
| 109 |
async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
|
| 110 |
-
# 1) Descriptions and objectives/agenda batches
|
| 111 |
sdesc, ldesc, fobj, fout = await asyncio.gather(
|
| 112 |
_batch_async(df.get(dcol, "").fillna("").tolist(),
|
| 113 |
"Create a concise 250-character summary of this course description:", client),
|
|
@@ -118,7 +134,7 @@ async def _enrich_dataframe(df: pd.DataFrame, dcol: str, ocol: str, pcol: str, a
|
|
| 118 |
_batch_async(df.get(acol, "").fillna("").tolist(),
|
| 119 |
"Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':", client),
|
| 120 |
)
|
| 121 |
-
#
|
| 122 |
prereq_raw = df.get(pcol, "").fillna("").tolist()
|
| 123 |
fpre: list[str] = []
|
| 124 |
for req in prereq_raw:
|
|
@@ -150,7 +166,6 @@ def convert(path: str) -> BytesIO:
|
|
| 150 |
df = _read(path)
|
| 151 |
df.columns = df.columns.str.strip()
|
| 152 |
|
| 153 |
-
# Helper to locate first existing column name from a list of candidates
|
| 154 |
first_col = lambda *candidates: next((c for c in candidates if c in df.columns), None)
|
| 155 |
|
| 156 |
dcol = first_col("Description", "Decription")
|
|
@@ -161,7 +176,7 @@ def convert(path: str) -> BytesIO:
|
|
| 161 |
sid = first_col("Course SID", "Course SID")
|
| 162 |
|
| 163 |
if dur not in df.columns:
|
| 164 |
-
df[dur] = "" #
|
| 165 |
|
| 166 |
# ---------- LLM enrichment (async) -------------------------------------
|
| 167 |
sdesc, ldesc, fobj, fout, fpre = asyncio.run(_enrich_dataframe(df, dcol, ocol, pcol, acol))
|
|
@@ -297,8 +312,8 @@ ui = gr.Interface(
|
|
| 297 |
fn=process_file,
|
| 298 |
inputs=gr.File(label="Upload NetCom CSV / Excel", file_types=[".csv", ".xlsx", ".xls"]),
|
| 299 |
outputs=gr.File(label="Download WooCommerce CSV"),
|
| 300 |
-
title="NetCom → WooCommerce CSV Processor (Try
|
| 301 |
-
description="Upload NetCom schedule (.csv/.xlsx) to get the Try
|
| 302 |
analytics_enabled=False,
|
| 303 |
)
|
| 304 |
|
|
|
|
| 1 |
+
"""NetCom → WooCommerce transformer (Try 2 schema — cleaned async, 100-parallel + de-dupe)
|
| 2 |
+
=========================================================================================
|
| 3 |
*Accept CSV **or** Excel schedule files and output the WooCommerce CSV.*
|
| 4 |
|
| 5 |
+
Changes vs previous Try 2
|
| 6 |
+
-------------------------
|
| 7 |
+
* Cap OpenAI concurrency to **100** with an `asyncio.Semaphore`.
|
| 8 |
+
* Prevent duplicate prompts from hitting the API in parallel with an
|
| 9 |
+
in-flight `dict[str, asyncio.Future]`.
|
| 10 |
+
* Everything else (cache filenames, interface, outputs) stays the same.
|
| 11 |
"""
|
| 12 |
|
| 13 |
from __future__ import annotations
|
|
|
|
| 25 |
import openai
|
| 26 |
import pandas as pd
|
| 27 |
|
| 28 |
+
# -------- Gradio bool-schema hot-patch --------------------------------------
|
| 29 |
_original = gradio_client.utils._json_schema_to_python_type
|
| 30 |
|
| 31 |
def _fixed_json_schema_to_python_type(schema, defs=None): # type: ignore
|
|
|
|
| 55 |
pass
|
| 56 |
|
| 57 |
# -------- Async GPT helpers --------------------------------------------------
|
| 58 |
+
_SEM = asyncio.Semaphore(100) # ≤100 concurrent calls
|
| 59 |
+
_inflight: dict[str, asyncio.Future] = {} # prompt → Future
|
| 60 |
+
|
| 61 |
async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
|
| 62 |
+
"""Single LLM call with on-disk cache, concurrency cap, and de-duplication."""
|
| 63 |
cached = _get_cached(prompt)
|
| 64 |
if cached is not None:
|
| 65 |
return cached
|
| 66 |
|
| 67 |
+
# — de-dup identical prompts already in flight —
|
| 68 |
+
running = _inflight.get(prompt)
|
| 69 |
+
if running is not None:
|
| 70 |
+
return await running # reuse the same Future
|
| 71 |
+
|
| 72 |
+
loop = asyncio.get_running_loop()
|
| 73 |
+
|
| 74 |
+
async def _call_api() -> str:
|
| 75 |
+
async with _SEM: # concurrency limiter
|
| 76 |
+
try:
|
| 77 |
+
msg = await client.chat.completions.create(
|
| 78 |
+
model="gpt-4o-mini",
|
| 79 |
+
messages=[{"role": "user", "content": prompt}],
|
| 80 |
+
temperature=0,
|
| 81 |
+
)
|
| 82 |
+
text = msg.choices[0].message.content
|
| 83 |
+
except Exception as exc: # network/auth errors
|
| 84 |
+
text = f"Error: {exc}"
|
| 85 |
+
_set_cache(prompt, text)
|
| 86 |
+
return text
|
| 87 |
+
|
| 88 |
+
task = loop.create_task(_call_api())
|
| 89 |
+
_inflight[prompt] = task # register
|
| 90 |
try:
|
| 91 |
+
return await task
|
| 92 |
+
finally:
|
| 93 |
+
_inflight.pop(prompt, None) # clean up even on error
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
async def _batch_async(lst: list[str], instruction: str, client: openai.AsyncOpenAI) -> list[str]:
|
| 96 |
"""Vectorised helper — returns an output list matching *lst* length."""
|
|
|
|
| 100 |
if isinstance(txt, str) and txt.strip():
|
| 101 |
idx.append(i)
|
| 102 |
prompts.append(f"{instruction}\n\nText: {txt}")
|
| 103 |
+
if not prompts: # fast-path: all rows empty
|
|
|
|
| 104 |
return out
|
| 105 |
|
| 106 |
+
# Gather — duplicate prompts handled inside _gpt_async
|
| 107 |
responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
|
| 108 |
for j, val in enumerate(responses):
|
| 109 |
out[idx[j]] = val
|
|
|
|
| 124 |
async def _enrich_dataframe(df: pd.DataFrame, dcol: str, ocol: str, pcol: str, acol: str) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
|
| 125 |
"""Run all LLM batches concurrently and return the five enrichment columns."""
|
| 126 |
async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
|
|
|
|
| 127 |
sdesc, ldesc, fobj, fout = await asyncio.gather(
|
| 128 |
_batch_async(df.get(dcol, "").fillna("").tolist(),
|
| 129 |
"Create a concise 250-character summary of this course description:", client),
|
|
|
|
| 134 |
_batch_async(df.get(acol, "").fillna("").tolist(),
|
| 135 |
"Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':", client),
|
| 136 |
)
|
| 137 |
+
# Prerequisites (some rows empty → default)
|
| 138 |
prereq_raw = df.get(pcol, "").fillna("").tolist()
|
| 139 |
fpre: list[str] = []
|
| 140 |
for req in prereq_raw:
|
|
|
|
| 166 |
df = _read(path)
|
| 167 |
df.columns = df.columns.str.strip()
|
| 168 |
|
|
|
|
| 169 |
first_col = lambda *candidates: next((c for c in candidates if c in df.columns), None)
|
| 170 |
|
| 171 |
dcol = first_col("Description", "Decription")
|
|
|
|
| 176 |
sid = first_col("Course SID", "Course SID")
|
| 177 |
|
| 178 |
if dur not in df.columns:
|
| 179 |
+
df[dur] = "" # ensure Duration col exists
|
| 180 |
|
| 181 |
# ---------- LLM enrichment (async) -------------------------------------
|
| 182 |
sdesc, ldesc, fobj, fout, fpre = asyncio.run(_enrich_dataframe(df, dcol, ocol, pcol, acol))
|
|
|
|
| 312 |
fn=process_file,
|
| 313 |
inputs=gr.File(label="Upload NetCom CSV / Excel", file_types=[".csv", ".xlsx", ".xls"]),
|
| 314 |
outputs=gr.File(label="Download WooCommerce CSV"),
|
| 315 |
+
title="NetCom → WooCommerce CSV Processor (Try 2)",
|
| 316 |
+
description="Upload NetCom schedule (.csv/.xlsx) to get the Try 2-formatted WooCommerce CSV.",
|
| 317 |
analytics_enabled=False,
|
| 318 |
)
|
| 319 |
|