hasanbasbunar committed on
Commit
a6a41c4
·
verified ·
1 Parent(s): ec86ec8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +1137 -0
app.py ADDED
@@ -0,0 +1,1137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import tempfile
4
+ import os
5
+ import base64
6
+ import re
7
+ from io import BytesIO
8
+ from PIL import Image
9
+ from typing import Optional
10
+ from pydantic import BaseModel, Field, create_model
11
+ from datetime import date
12
+ from openai import OpenAI
13
+ from dotenv import load_dotenv
14
+
15
+ load_dotenv()
16
+
17
# Canonical (English) labels of the supported field formats.
FIELD_FORMATS = ["text", "date", "number", "true/false", "empty", "multiple choice", "unit"]
# Maximum number of characters accepted for a field name.
NAME_MAX_CHARS = 100
# Maximum number of characters accepted for a field description.
PROMPT_MAX_CHARS = 300
28
+
29
def normalize_format_label(fmt_raw: str) -> str:
    """Return the canonical English label for a French or English format name.

    Surrounding whitespace and letter case are ignored. ``None``, empty
    and unrecognised labels all fall back to ``"text"``.
    """
    canonical_labels = (
        "text",
        "date",  # spelled the same in French and English
        "number",
        "true/false",
        "empty",
        "multiple choice",
        "unit",
    )
    french_to_english = {
        "texte": "text",
        "nombre": "number",
        "vrai/faux": "true/false",
        "vide": "empty",
        "choix multiple": "multiple choice",
        "unité": "unit",
    }
    key = str(fmt_raw or "").strip().lower()
    if key in canonical_labels:
        return key
    return french_to_english.get(key, "text")
48
+
49
+
50
+ IDENTIFIER_REGEX = re.compile(r"^[A-Za-z][A-Za-z0-9_-]{0,99}$")
51
+
52
+
53
def is_image_url(url: str) -> bool:
    """Tell whether *url* is an http(s) link ending in a raster image extension.

    The comparison is case-insensitive and ignores any query string or
    fragment. Empty or missing URLs are rejected.
    """
    if not url:
        return False
    lowered = url.strip().lower()
    if not lowered.startswith(("http://", "https://")):
        return False
    # Drop the query string first, then the fragment, before looking at the suffix.
    without_query = lowered.partition("?")[0]
    without_fragment = without_query.partition("#")[0]
    # Common raster image extensions only.
    raster_suffixes = (".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif")
    return without_fragment.endswith(raster_suffixes)
64
+
65
+
66
def is_valid_ascii_identifier(value: str) -> bool:
    """Check that *value*, once stripped, matches ``IDENTIFIER_REGEX`` and is pure ASCII."""
    candidate = str(value or "").strip()
    if not candidate or IDENTIFIER_REGEX.match(candidate) is None:
        return False
    # Same outcome as attempting candidate.encode("ascii").
    return candidate.isascii()
77
+
78
+
79
def live_validate_field_name(name: str):
    """Build the two live-validation updates for the field-name input.

    Returns a pair: an update carrying the (possibly empty) red error
    HTML, and an update whose ``interactive`` flag is True only when the
    name is valid.
    """
    if is_valid_ascii_identifier(name):
        return gr.update(value="", visible=False), gr.update(interactive=True)
    msg = "Only ASCII letters, digits, '_' or '-' allowed; start with a letter; no spaces or accents."
    html = f"<span style='color:#dc2626;font-weight:600'>{msg}</span>"
    return gr.update(value=html, visible=True), gr.update(interactive=False)
85
+
86
+
87
def live_validate_choice(choice: str):
    """Build the two live-validation updates for the choice input.

    Returns a pair: an update carrying the (possibly empty) red error
    HTML, and an update whose ``interactive`` flag is True only when the
    choice is non-empty and a valid identifier.
    """
    trimmed = (choice or "").strip()
    if not trimmed:
        msg = "Enter a non-empty choice."
    elif not is_valid_ascii_identifier(trimmed):
        msg = "Only ASCII letters, digits, '_' or '-' allowed; start with a letter; no spaces or accents."
    else:
        return gr.update(value="", visible=False), gr.update(interactive=True)
    html = f"<span style='color:#dc2626;font-weight:600'>{msg}</span>"
    return gr.update(value=html, visible=True), gr.update(interactive=False)
96
+
97
+
98
def error_update(msg: str):
    """Return a visible component update showing *msg* as bold red HTML.

    Some callers pass text derived from exceptions, so ``&``, ``<`` and
    ``>`` are escaped before the message is embedded in the markup.
    Quotes are left untouched because the message is element content,
    not an attribute value.
    """
    import html

    safe = html.escape(str(msg), quote=False)
    return gr.update(value=f"<span style='color:#dc2626;font-weight:600'>{safe}</span>", visible=True)
100
+
101
def fields_to_rows(fields):
    """Convert field dicts into ``[name, format, description, details]`` rows.

    ``name`` and ``format`` are required keys; ``description`` and
    ``details`` default to an empty string.
    """
    rows = []
    for field in fields:
        row = [field["name"], field["format"]]
        row.append(field.get("description", ""))
        row.append(field.get("details", ""))
        rows.append(row)
    return rows
108
+
109
def names_from_fields(fields):
    """List the ``name`` of every field as a string (``""`` when the key is missing)."""
    names = []
    for field in fields or []:
        names.append(str(field.get("name", "")))
    return names
111
+
112
+
113
def add_field(name, field_format, description, choices_list, unit, fields):
    """Validate a new field definition and append it to the model.

    Args:
        name: Field name; must be a valid ASCII identifier, unique
            (case-insensitively) and at most ``NAME_MAX_CHARS`` long.
        field_format: Format label (French or English); normalised with
            ``normalize_format_label``.
        description: Free text, at most ``PROMPT_MAX_CHARS`` characters.
        choices_list: Choices, only used for the "multiple choice" format.
        unit: Unit label, only used for the "unit" format.
        fields: Current list of field dicts (may be ``None``).

    Returns:
        A 6-tuple: status update, field list, table rows, an update with
        the field names as ``choices``, a visibility update, and the
        "model ready" update. On a validation failure the field list is
        returned unchanged together with an error status.
    """
    name = (name or "").strip()
    field_format = normalize_format_label(field_format)
    description = (description or "").strip()
    unit = (unit or "").strip()
    current = fields or []

    def outputs(status, field_list):
        # Single place that builds the six outputs, so every exit stays in sync.
        has_fields = len(field_list) > 0
        return (
            status,
            field_list,
            fields_to_rows(field_list),
            gr.update(choices=names_from_fields(field_list), value=None, visible=has_fields),
            gr.update(visible=has_fields),
            ready_update_from_fields(field_list),
        )

    def reject(message):
        return outputs(error_update(message), current)

    if not name:
        return reject("⚠️ Field name is required.")
    # Checked before the identifier pattern: the pattern also caps the
    # length, which would otherwise hide this more specific message.
    if len(name) > NAME_MAX_CHARS:
        return reject(f"⚠️ Name too long (max {NAME_MAX_CHARS} characters).")
    if not is_valid_ascii_identifier(name):
        return reject(
            "⚠️ Invalid field name: use ASCII letters, digits, '_' or '-'; start with a letter; no spaces or accents."
        )
    # Uniqueness is case-insensitive and ignores surrounding whitespace.
    existing = {str(f.get("name", "")).strip().lower() for f in current}
    if name.lower() in existing:
        return reject("⚠️ This field name already exists.")
    if len(description) > PROMPT_MAX_CHARS:
        return reject(f"⚠️ Description too long (max {PROMPT_MAX_CHARS} characters).")

    details = ""
    options = []
    if field_format == "multiple choice":
        # Stored stripped, matching how import_model cleans choices.
        options = [str(c).strip() for c in (choices_list or []) if str(c).strip()]
        if len(options) < 2:
            return reject("⚠️ For ‘multiple choice’, add at least 2 choices.")
        if len({c.lower() for c in options}) != len(options):
            return reject("⚠️ For ‘multiple choice’, choices must be unique.")
        details = "choices: " + " | ".join(options)
    elif field_format == "unit" and unit:
        details = f"unit: {unit}"

    new_fields = list(current)
    new_fields.append({
        "name": name,
        "format": field_format,
        "description": description,
        "details": details,
        "options": options,
        "unit": unit if field_format == "unit" else "",
    })
    return outputs(gr.update(value="", visible=False), new_fields)
213
+
214
+
215
def delete_field(delete_name, fields):
    """Remove the field called *delete_name* (case-insensitive, trimmed).

    Returns a 6-tuple: status update, field list, table rows, an update
    with the field names as ``choices``, a visibility update, and the
    "model ready" update. When nothing is selected or no field matches,
    the list is returned unchanged with an error status.
    """
    current_fields = list(fields or [])

    def respond(status, field_list):
        any_left = len(field_list) > 0
        return (
            status,
            field_list,
            fields_to_rows(field_list),
            gr.update(choices=names_from_fields(field_list), value=None, visible=any_left),
            gr.update(visible=any_left),
            ready_update_from_fields(field_list),
        )

    if not delete_name:
        return respond(error_update("⚠️ Select a field to delete."), current_fields)

    target = str(delete_name).strip().lower()
    remaining = []
    for field in current_fields:
        if str(field.get("name", "")).strip().lower() != target:
            remaining.append(field)

    # Same length means no field had the requested name.
    if len(remaining) == len(current_fields):
        return respond(error_update("⚠️ Field not found."), current_fields)
    return respond(gr.update(value="", visible=False), remaining)
247
+
248
+
249
def serialize_model(fields):
    """Wrap a shallow copy of *fields* in the versioned export envelope."""
    payload = {"version": 1}
    payload["fields"] = list(fields) if fields else []
    return payload
251
+
252
+
253
def count_message(fields):
    """Return the "<n> field(s) in model" label for the current field list."""
    total = len(fields or [])
    # Zero and one both use the singular form.
    noun = "field" if total in (0, 1) else "fields"
    return f"{total} {noun} in model"
260
+
261
+
262
def visibility_updates_from_fields(fields):
    """Return four updates that are visible only when the model has fields.

    The tuple order is: delete dropdown (also refreshed with the field
    names), download button, delete button, model filename input.
    """
    field_list = fields or []
    show = len(field_list) > 0
    delete_dropdown = gr.update(choices=names_from_fields(field_list), value=None, visible=show)
    download_btn = gr.update(visible=show)
    delete_btn = gr.update(visible=show)
    model_filename = gr.update(visible=show)
    return delete_dropdown, download_btn, delete_btn, model_filename
270
+
271
+
272
def sanitize_filename(name):
    """Turn a user-supplied name into a safe ``*.json`` file name.

    Directory components are dropped, every character other than
    alphanumerics, ``-``, ``_``, ``.`` and space becomes ``-``, and the
    result is capped at 100 characters. The cap is applied to the stem,
    so the ``.json`` extension always survives truncation. A name with
    no usable stem (empty, blank, ``"dir/"``, ``".json"``) yields
    ``"model.json"``.
    """
    max_len = 100
    extension = ".json"
    candidate = (name or "").strip()
    # Keep only the last path component, whichever separator was used.
    candidate = candidate.replace("\\", "/").split("/")[-1]
    candidate = "".join(
        ch if (ch.isalnum() or ch in ("-", "_", ".", " ")) else "-"
        for ch in candidate
    )
    if candidate.lower().endswith(extension):
        # Preserve the extension exactly as typed (e.g. ".JSON").
        stem, extension = candidate[:-len(extension)], candidate[-len(extension):]
    else:
        stem = candidate
    stem = stem[:max_len - len(extension)]
    if not stem.strip():
        return "model.json"
    return stem + extension
290
+
291
+
292
def export_model(fields, filename):
    """Write the model as JSON into a fresh temp directory and expose the file.

    Returns a hidden update when there are no fields; otherwise an
    update whose value is the path of the written file.
    """
    if not fields:
        return gr.update(visible=False)
    payload = serialize_model(fields)
    safe_name = sanitize_filename(filename)
    target = os.path.join(tempfile.mkdtemp(prefix="model-"), safe_name)
    with open(target, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)
    return gr.update(value=target, visible=True)
302
+
303
+
304
def to_python_identifier(name: str) -> str:
    """Derive a lowercase, underscore-separated identifier from *name*.

    Each run of non-alphanumeric characters collapses to one ``_``,
    leading and trailing underscores are removed, an empty result becomes
    ``"field"``, and a leading digit is prefixed with ``"field_"``.
    """
    lowered = str(name or "").strip().lower()
    if not lowered:
        return "field"
    pieces = []
    for ch in lowered:
        if ch.isalnum():
            pieces.append(ch)
        elif not pieces or pieces[-1] != "_":
            # First separator of a run; later ones are skipped.
            pieces.append("_")
    ident = "".join(pieces).strip("_") or "field"
    return f"field_{ident}" if ident[0].isdigit() else ident
324
+
325
+
326
def generate_pydantic_code(fields, class_name: str = "DocumentModel") -> str:
    """Render the field list as source code for a Pydantic model class.

    Only the imports that are actually needed are emitted. "empty" fields
    become ``Optional[str]`` with a ``None`` default; "multiple choice"
    fields with options become a ``Literal[...]``; every other field is
    required.

    A field whose derived identifier is a reserved Python keyword (for
    example ``class`` or ``from``) is emitted with a trailing underscore
    and ``alias=`` set to the original identifier, so the generated
    module parses and still reads/writes the original key.

    Args:
        fields: List of field dicts (may be ``None``).
        class_name: Name of the generated class.

    Returns:
        The generated Python source as a single string.
    """
    import keyword

    fields = list(fields or [])
    formats = [normalize_format_label(f.get("format")) for f in fields]
    uses_optional = "empty" in formats
    uses_literal = any(
        fmt == "multiple choice" and f.get("options")
        for fmt, f in zip(formats, fields)
    )
    uses_date = "date" in formats

    simple_types = {
        "text": "str",
        "date": "date",
        "number": "float",
        "true/false": "bool",
        "unit": "float",
    }

    def type_for(f, fmt):
        """Return ``(annotation_source, is_optional)`` for one field."""
        if fmt == "empty":
            return "Optional[str]", True
        options = f.get("options", [])
        if fmt == "multiple choice" and options:
            lits = ", ".join(repr(str(o)) for o in options)
            return f"Literal[{lits}]", False
        # "multiple choice" without options and unknown formats fall back to str.
        return simple_types.get(fmt, "str"), False

    lines = ["from pydantic import BaseModel, Field"]
    if uses_optional:
        lines.append("from typing import Optional")
    if uses_literal:
        lines.append("from typing import Literal")
    if uses_date:
        lines.append("from datetime import date")
    lines.append("")
    lines.append(f"class {class_name}(BaseModel):")
    if not fields:
        lines.append("    pass")
        return "\n".join(lines)

    for f, fmt in zip(fields, formats):
        raw_name = f.get("name", "")
        ident = to_python_identifier(raw_name)
        typ, is_optional = type_for(f, fmt)
        desc = f.get("description", "")
        details = f.get("details", "")
        desc_full = desc if details == "" else (desc + " | " + details)
        field_args = "None" if is_optional else "..."
        if keyword.iskeyword(ident):
            # A keyword cannot be an attribute name: rename and alias it.
            field_args += f", alias={ident!r}"
            ident += "_"
        lines.append(f"    # {raw_name} ({f.get('format')})")
        lines.append(f"    {ident}: {typ} = Field({field_args}, description={desc_full!r})")
    return "\n".join(lines)
381
+
382
+
383
def pydantic_code_update_from_fields(fields):
    """Return a hidden update holding the generated Pydantic code.

    Kept for possible backward compatibility; the update is never made
    visible. The value is empty when there are no fields.
    """
    has_fields = len(fields or []) > 0
    code = generate_pydantic_code(fields) if has_fields else ""
    return gr.update(value=code, visible=False)
390
+
391
+
392
def export_pydantic_py(fields):
    """Write the generated Pydantic code to ``document_model.py`` in a temp dir.

    Returns a hidden update when there are no fields; otherwise an
    update whose value is the path of the written file.
    """
    if not fields:
        return gr.update(visible=False)
    source = generate_pydantic_code(fields)
    out_dir = tempfile.mkdtemp(prefix="pydantic-")
    target = os.path.join(out_dir, "document_model.py")
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(source)
    return gr.update(value=target, visible=True)
401
+
402
+
403
def build_pydantic_model_class(fields, class_name: str = "DocumentModel"):
    """Build a Pydantic model class at runtime from the field list.

    "empty" fields are ``Optional[str]`` defaulting to ``None``; all
    other fields are required. "multiple choice" fields with options are
    typed as a ``Literal`` of those options, so the allowed values appear
    as an ``enum`` in the JSON schema and are also enforced when output
    is validated. Previously the enum was only advertised in the schema
    and any string passed validation.

    Args:
        fields: List of field dicts (may be ``None``).
        class_name: Name given to the created model class.

    Returns:
        The model class created by ``pydantic.create_model``.
    """
    from typing import Literal

    plain_types = {
        "text": str,
        "date": date,
        "number": float,
        "true/false": bool,
        "unit": float,
    }
    field_definitions = {}
    for f in (fields or []):
        # NOTE(review): two names that map to the same identifier
        # (e.g. "a-b" and "a_b") overwrite each other here.
        ident = to_python_identifier(f.get("name", ""))
        fmt = normalize_format_label(f.get("format"))
        desc = f.get("description", "")
        details = f.get("details", "")
        desc_full = desc if details == "" else (desc + " | " + details)
        options = [str(o) for o in (f.get("options", []) or [])]

        default = ...
        if fmt == "empty":
            typ = Optional[str]
            default = None
        elif fmt == "multiple choice" and options:
            # Subscripting Literal with a tuple is the same as listing the values.
            typ = Literal[tuple(options)]
        else:
            # "multiple choice" without options and unknown formats fall back to str.
            typ = plain_types.get(fmt, str)

        field_definitions[ident] = (typ, Field(default, description=desc_full))

    return create_model(class_name, **field_definitions)
449
+
450
+
451
def json_schema_from_fields(fields):
    """Return the JSON Schema of the model built from *fields*, pretty-printed."""
    schema = build_pydantic_model_class(fields).model_json_schema()
    return json.dumps(schema, indent=2, ensure_ascii=False)
455
+
456
+
457
def instruction_from_fields(fields):
    """Build the extraction prompt: fixed instructions followed by the JSON Schema.

    Returns an empty string when there are no fields.
    """
    if not fields:
        return ""
    preamble = (
        "Extract the following information from the provided image. "
        "Respond only with a strictly valid JSON that conforms to this JSON Schema (no text outside JSON):\n"
    )
    return preamble + json_schema_from_fields(fields)
466
+
467
+
468
+
469
+
470
+
471
def document_file_to_data_url_with_error(path: str):
    """Convert an image or the first page of a PDF into a PNG data URL.

    Files ending in ``.pdf`` are rendered with PyMuPDF at 300 DPI (first
    page only); any other file is opened with Pillow and re-encoded as
    RGB PNG.

    Args:
        path: Path of the document on disk.

    Returns:
        ``(data_url, None)`` on success, or ``("", message)`` describing
        why the document could not be converted.
    """
    if not path or not os.path.exists(path):
        return "", "File not found."
    if str(path).lower().endswith(".pdf"):
        try:
            import fitz  # PyMuPDF
        except Exception:
            return "", "PDF support requires PyMuPDF. Install with: pip install pymupdf"
        try:
            doc = fitz.open(path)
            try:
                if doc.page_count == 0:
                    return "", "PDF has no pages."
                page = doc.load_page(0)
                # PDF user space is 72 points per inch; scale up to 300 DPI.
                zoom = 300.0 / 72.0
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
                png_bytes = pix.tobytes("png")
            finally:
                # Release the document handle on every path.
                doc.close()
            b64 = base64.b64encode(png_bytes).decode("utf-8")
            return f"data:image/png;base64,{b64}", None
        except Exception as e:
            return "", f"Failed to render PDF: {e}"
    # Anything that is not a PDF is treated as an image.
    try:
        with Image.open(path) as im:
            rgb = im.convert("RGB")
            buf = BytesIO()
            rgb.save(buf, format="PNG", optimize=True)
        b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
        return f"data:image/png;base64,{b64}", None
    except Exception:
        return "", "Invalid image file."
504
+
505
+
506
+
507
+
508
+
509
def parse_json_from_text(text: str):
    """Extract and parse a JSON object embedded in free text.

    When the text contains fenced (triple-backtick) blocks, the first
    fenced block that contains a ``{`` is searched; if none does, the
    whole text is searched. The span from the first ``{`` to the last
    ``}`` is parsed first. If that fails (for example because trailing
    prose contains a stray ``}``), the first complete JSON value starting
    at the first ``{`` is decoded instead.

    Args:
        text: Raw text, typically a model response.

    Returns:
        ``(data, None)`` on success, or ``(None, message)`` where the
        message is "Empty text", "JSON not detected" or "Invalid JSON: ...".
    """
    if text is None:
        return None, "Empty text"
    s = str(text)
    if "```" in s:
        parts = s.split("```")
        if len(parts) >= 3:
            # Odd-indexed parts are the fenced segments.
            fenced_with_json = [p for p in parts[1::2] if "{" in p]
            if fenced_with_json:
                s = fenced_with_json[0]
    start = s.find("{")
    end = s.rfind("}")
    if start == -1 or end == -1 or end <= start:
        return None, "JSON not detected"
    try:
        return json.loads(s[start:end + 1]), None
    except Exception as e:
        first_error = e
    # Fallback: decode only the first complete value, ignoring what follows it.
    try:
        data, _ = json.JSONDecoder().raw_decode(s, start)
        return data, None
    except Exception:
        return None, f"Invalid JSON: {first_error}"
527
+
528
+
529
def validate_output_against_model(fields, text):
    """Validate model output text against the Pydantic model built from *fields*.

    Args:
        fields: List of field dicts describing the expected structure.
        text: Raw text expected to contain a JSON object.

    Returns:
        ``(True, "OK", normalized_json)`` when the JSON validates, or
        ``(False, message, None)`` otherwise. For validation failures the
        message lists up to five ``- location: reason`` lines.
    """
    model = build_pydantic_model_class(fields)
    data, err = parse_json_from_text(text)
    if err:
        return False, err, None
    try:
        instance = model.model_validate(data)
        # mode="json" converts values such as dates to JSON-compatible
        # types; without it json.dumps raises on any "date" field and a
        # valid output is reported as non-compliant.
        normalized = json.dumps(instance.model_dump(mode="json"), ensure_ascii=False, indent=2)
        return True, "OK", normalized
    except Exception as e:
        try:
            # Errors without an errors() method fall back to str(e).
            details = getattr(e, 'errors', lambda: [])()
            msgs = []
            for d in details[:5]:
                loc = ".".join(map(str, d.get('loc', [])))
                msg = d.get('msg', 'error')
                msgs.append(f"- {loc}: {msg}")
            extra = "\n".join(msgs) if msgs else str(e)
        except Exception:
            extra = str(e)
        return False, extra, None
550
+
551
+
552
def _validated_fields_or_error(fields_raw):
    """Validate raw field dicts read from an uploaded model file.

    Returns ``(cleaned_fields, None)`` when every field passes, or
    ``(None, message)`` for the first rule that fails.
    """
    cleaned = []
    seen = set()
    for item in fields_raw:
        name = str(item.get("name", "")).strip()
        fmt = normalize_format_label(str(item.get("format", "")).strip())
        description = str(item.get("description", ""))
        options = item.get("options", []) if isinstance(item, dict) else []
        unit = str(item.get("unit", ""))
        if not name or len(name) > NAME_MAX_CHARS or not is_valid_ascii_identifier(name):
            return None, "⚠️ Invalid model: field name must be ASCII [A-Za-z][A-Za-z0-9_-]* and <= length limit."
        key = name.lower()
        if key in seen:
            return None, "⚠️ Invalid model: duplicate field names."
        seen.add(key)
        if fmt not in FIELD_FORMATS:
            return None, "⚠️ Invalid model: unknown format."
        if len(description) > PROMPT_MAX_CHARS:
            return None, "⚠️ Invalid model: description too long."
        details = ""
        if fmt == "multiple choice":
            options = [str(c).strip() for c in (options or []) if str(c).strip()]
            if len(options) < 2:
                return None, "⚠️ Invalid model: ‘multiple choice’ requires at least 2 choices."
            for c in options:
                if not is_valid_ascii_identifier(c):
                    return None, "⚠️ Invalid model: choices must match [A-Za-z][A-Za-z0-9_-]* with no spaces or accents."
            if len({c.lower() for c in options}) != len(options):
                return None, "⚠️ Invalid model: choices must be unique."
            details = "choices: " + " | ".join(options)
        elif fmt == "unit":
            unit = unit.strip()
            if unit:
                details = f"unit: {unit}"
        cleaned.append({
            "name": name,
            "format": fmt,
            "description": description,
            "details": details,
            "options": options if fmt == "multiple choice" else [],
            "unit": unit if fmt == "unit" else "",
        })
    return cleaned, None


def run_extraction(model_file_extraction, model_file_modeltab, fields_state, image_path, image_url, hf_token):
    """Stream a structured extraction for a document; yields ``(text, status_update)``.

    The model definition is taken, in order of priority, from the file
    uploaded in the Extraction tab (fully validated), the file uploaded
    in the Model tab (formats normalised only), or the manually built
    state. The image URL takes priority over the uploaded file. The
    request goes to the OpenRouter endpoint using *hf_token* as API key,
    falling back to the ``OPENROUTER_API_KEY`` environment variable.
    While streaming, each yield carries the text accumulated so far; the
    final yield reports whether the output validates against the model.
    """

    def plain_status(message):
        return gr.update(value=message, visible=True)

    def raw_fields_from_upload(file_ref):
        """Return the raw "fields" list of an uploaded model file, or None if the file is missing."""
        path = file_ref if isinstance(file_ref, str) else file_ref.get("path")
        if not (path and os.path.exists(path)):
            return None
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data.get("fields", []) if isinstance(data, dict) else []

    # Choose model source: Extraction > Model (upload) > Model (built)
    try:
        selected_fields = None
        if model_file_extraction:
            # 1) File uploaded in Extraction tab
            fields_raw = raw_fields_from_upload(model_file_extraction)
            if fields_raw is None:
                yield ("", error_update("⚠️ Model file not found."))
                return
            cleaned, problem = _validated_fields_or_error(fields_raw)
            if problem is not None:
                yield ("", plain_status(problem))
                return
            selected_fields = cleaned
        elif model_file_modeltab:
            # 2) File uploaded in Model tab
            raw_fields = raw_fields_from_upload(model_file_modeltab)
            if raw_fields is None:
                yield ("", error_update("⚠️ Model file not found."))
                return
            # Normalize formats to English for internal use.
            selected_fields = []
            for item in raw_fields:
                entry = dict(item)
                entry["format"] = normalize_format_label(entry.get("format"))
                selected_fields.append(entry)
        else:
            # 3) Model built manually (state); normalize possible legacy French formats.
            selected_fields = []
            for item in (fields_state or []):
                entry = dict(item)
                entry["format"] = normalize_format_label(entry.get("format"))
                selected_fields.append(entry)
        if not selected_fields:
            yield ("", error_update("⚠️ Model not ready."))
            return
    except Exception:
        yield ("", plain_status("⚠️ Invalid model file."))
        return

    # Build the instruction, then run the streaming call; yields (accumulated text, status).
    instruction_text = instruction_from_fields(selected_fields)
    if not instruction_text:
        yield ("", error_update("⚠️ Model not ready."))
        return

    # Choose image source: URL has priority over uploaded file.
    image_url = (image_url or "").strip()
    if image_url:
        # Only image URLs are accepted (no PDF).
        if not is_image_url(image_url):
            yield ("", error_update("⚠️ Only direct image URLs are allowed (jpg, jpeg, png, gif, webp, bmp, tiff)."))
            return
        final_image_ref = image_url
    elif not image_path:
        yield ("", error_update("⚠️ Provide an image/PDF file or a URL."))
        return
    else:
        data_url, err = document_file_to_data_url_with_error(image_path)
        if not data_url:
            msg = err or "Invalid document (image/PDF)."
            yield ("", error_update("⚠️ " + msg))
            return
        final_image_ref = data_url

    try:
        api_key = (hf_token or "").strip() or os.getenv("OPENROUTER_API_KEY", "")
        client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key)
        if not client.api_key:
            yield ("", plain_status("⚠️ Missing OPENROUTER_API_KEY environment variable."))
            return
        # Optional headers, sent only when the matching variables are set.
        extra_headers = {}
        ref = os.getenv("OPENROUTER_HTTP_REFERER", "").strip()
        ttl = os.getenv("OPENROUTER_X_TITLE", "").strip()
        if ref:
            extra_headers["HTTP-Referer"] = ref
        if ttl:
            extra_headers["X-Title"] = ttl
        model_name = os.getenv("OPENROUTER_MODEL", "openai/gpt-4o")
        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": instruction_text},
                {"type": "image_url", "image_url": {"url": final_image_ref}},
            ],
        }
        stream = client.chat.completions.create(
            extra_headers=extra_headers or None,
            model=model_name,
            messages=[user_message],
            stream=True,
        )
        collected = ""
        for chunk in stream:
            choices = getattr(chunk, "choices", None)
            if not choices:
                continue
            delta = getattr(choices[0], "delta", None)
            piece = getattr(delta, "content", None) if delta is not None else None
            if piece:
                collected += piece
                yield (collected, plain_status("Validating…"))
        if not collected:
            yield ("", plain_status("⚠️ Empty model response."))
            return
        ok, info, normalized = validate_output_against_model(selected_fields, collected)
        if not ok:
            yield (collected, plain_status(f"❌ Output not compliant:\n{info}"))
            return
        msg = "✅ Output matches the model."
        if normalized:
            msg += "\n\nNormalized preview:\n" + normalized
        yield (collected, plain_status(msg))
        return
    except Exception as e:
        yield ("", plain_status(f"⚠️ API call error: {e}"))
        return
723
+
724
+
725
def import_model(uploaded_file):
    """Load and validate a model JSON file uploaded by the user.

    Returns a 6-tuple: status update, field list, table rows, an update
    with the field names as ``choices``, a visibility update, and the
    "model ready" update. Any validation failure or unreadable file
    yields an error status together with an empty model.
    """

    def rejected(message):
        # Every failure resets the outputs to the empty-model state.
        return (
            error_update(message),
            [],
            [],
            gr.update(choices=[], value=None, visible=False),
            gr.update(visible=False),
            ready_update_from_fields([]),
        )

    try:
        if not uploaded_file:
            return rejected("⚠️ No file provided.")
        path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.get("path")
        if not path or not os.path.exists(path):
            return rejected("⚠️ File not found.")
        with open(path, "r", encoding="utf-8") as handle:
            data = json.load(handle)
        fields = data.get("fields", []) if isinstance(data, dict) else []

        # Basic validation, field by field.
        cleaned = []
        seen = set()
        for item in fields:
            name = str(item.get("name", "")).strip()
            fmt = str(item.get("format", "")).strip()
            description = str(item.get("description", ""))
            options = item.get("options", []) if isinstance(item, dict) else []
            unit = str(item.get("unit", ""))

            if not name or len(name) > NAME_MAX_CHARS or not is_valid_ascii_identifier(name):
                return rejected("⚠️ Invalid model: field name must match [A-Za-z][A-Za-z0-9_-]* and length limit.")
            key = name.lower()
            if key in seen:
                return rejected("⚠️ Invalid model: duplicate field names.")
            seen.add(key)

            fmt = normalize_format_label(fmt)
            if fmt not in FIELD_FORMATS:
                return rejected("⚠️ Invalid model: unknown format.")
            if len(description) > PROMPT_MAX_CHARS:
                return rejected("⚠️ Invalid model: description too long.")

            details = ""
            if fmt == "multiple choice":
                options = [str(c).strip() for c in (options or []) if str(c).strip()]
                if len(options) < 2:
                    return rejected("⚠️ Invalid model: ‘multiple choice’ requires at least 2 choices.")
                if not all(is_valid_ascii_identifier(c) for c in options):
                    return rejected(
                        "⚠️ Invalid model: choices must match [A-Za-z][A-Za-z0-9_-]* with no spaces or accents."
                    )
                if len({c.lower() for c in options}) != len(options):
                    return rejected("⚠️ Invalid model: choices must be unique.")
                details = "choices: " + " | ".join(options)
            elif fmt == "unit":
                unit = unit.strip()
                if unit:
                    details = f"unit: {unit}"

            cleaned.append({
                "name": name,
                "format": fmt,
                "description": description,
                "details": details,
                "options": options if fmt == "multiple choice" else [],
                "unit": unit if fmt == "unit" else "",
            })

        has_fields = len(cleaned) > 0
        return (
            gr.update(value="", visible=False),
            cleaned,
            fields_to_rows(cleaned),
            gr.update(choices=names_from_fields(cleaned), value=None, visible=has_fields),
            gr.update(visible=has_fields),
            ready_update_from_fields(cleaned),
        )
    except Exception:
        return rejected("⚠️ Invalid model file.")
859
+
860
+
861
def ready_update_from_fields(fields):
    """Build the update for the "model ready" banner.

    Args:
        fields: Current list of model fields; may be ``None`` or empty.

    Returns:
        A ``gr.update`` that shows the ready message when at least one field
        exists, and hides the banner otherwise.
    """
    if not fields:
        return gr.update(visible=False)
    # The message must name the tab exactly as it is labelled in build_ui
    # ("Extract"), otherwise the user is pointed at a tab that does not exist.
    return gr.update(value="✅ Model ready. You can proceed to the ‘Extract’ tab.", visible=True)
866
+
867
+
868
def toggle_conditionals(field_format):
    """Show or hide the format-specific inputs for the selected format.

    Args:
        field_format: Format label selected in the format dropdown.

    Returns:
        A 5-tuple of ``gr.update`` objects, in this order: choice input,
        add-choice button, choices list, unit input, choices error. The
        choices error is also reset to an empty value.
    """
    label = normalize_format_label(field_format)
    show_choices = label == "multiple choice"
    show_unit = label == "unit"
    # The first three outputs all follow the "multiple choice" visibility.
    choice_widgets = tuple(gr.update(visible=show_choices) for _ in range(3))
    return (
        *choice_widgets,
        gr.update(visible=show_unit),
        gr.update(visible=show_choices, value=""),
    )
880
+
881
+
882
def update_char_counter(text):
    """Return the ``<used>/<max>`` counter label for the description box.

    Args:
        text: Current content of the description input; ``None`` or an empty
            value counts as zero characters.

    Returns:
        A string such as ``"12/<PROMPT_MAX_CHARS>"``.
    """
    used = len(text) if text else 0
    return "{}/{}".format(used, PROMPT_MAX_CHARS)
885
+
886
+
887
def add_choice(choice, current_choices):
    """Validate a candidate choice and append it to the pending choices.

    Args:
        choice: Raw text typed in the choice input (may be ``None``).
        current_choices: Choices accumulated so far (may be ``None``).

    Returns:
        A 4-tuple ``(error update, choices, rows, input value)``. On success
        the error is hidden, the new choice is appended and the input is
        cleared. On failure the choices are returned unchanged and the raw
        input text is returned as typed.
    """
    raw = choice or ""
    candidate = raw.strip()
    choices = list(current_choices or [])
    seen = {str(item).strip().lower() for item in choices}

    # Pick the first failing rule, if any; checks run in the same order as
    # the messages are listed.
    if not candidate:
        problem = "⚠️ Enter a non-empty choice."
    elif not is_valid_ascii_identifier(candidate):
        problem = "⚠️ Invalid choice: use ASCII letters, digits, '_' or '-'; start with a letter; no spaces or accents."
    elif candidate.lower() in seen:
        problem = "⚠️ This choice already exists."
    else:
        problem = None

    if problem is not None:
        unchanged_rows = [[item] for item in choices]
        return error_update(problem), choices, unchanged_rows, raw

    choices.append(candidate)
    updated_rows = [[item] for item in choices]
    return gr.update(value="", visible=False), choices, updated_rows, ""
904
+
905
+
906
def clear_choices_after_add(error_text, current_choices, current_rows, current_input):
    """Reset the pending choices unless an error message is displayed.

    Args:
        error_text: Current content of the error box.
        current_choices: Pending choices state.
        current_rows: Current value of the choices table.
        current_input: Current text of the choice input.

    Returns:
        A 4-tuple ``(choices, rows, input value, choices-error update)``.
        When ``error_text`` is blank, everything is cleared and the choices
        error is hidden; otherwise the incoming values are passed through and
        the choices error is left untouched.
    """
    has_error = bool(str(error_text or "").strip())
    if not has_error:
        return [], gr.update(value=[]), "", gr.update(value="", visible=False)
    # An error is showing: keep whatever the user has entered so far.
    return current_choices, current_rows, current_input, gr.update()
912
+
913
+
914
def build_ui():
    """Build the two-tab Gradio interface and wire its event handlers.

    The "Model" tab lets the user define, delete, export and import model
    fields; the "Extract" tab runs the extraction against a document.

    Returns:
        The ``gr.Blocks`` application, with its queue enabled.
    """
    # NOTE(review): the Row/TabItem nesting below was reconstructed from a copy
    # of this file whose indentation was lost; confirm the layout against the
    # running app.
    with gr.Blocks(title="Document model builder", analytics_enabled=False) as demo:
        with gr.Tabs():
            with gr.TabItem("Model"):
                gr.Markdown("## Step 1 — Create or load a model")
                gr.Markdown(
                    "Use this step to define the fields to extract. "
                    "You can either build the model manually or import a .json file. "
                    "This model will be used to validate and normalize the response.")
                gr.Markdown("### 1.1 Add a field")
                gr.Markdown(
                    "- Name: must be unique and short.\n"
                    "- Format: text, date, number, true/false, empty, multiple choice, unit.\n"
                    "- Description: short extraction hint (useful examples).")

                with gr.Row():
                    name_input = gr.Textbox(
                        label="Field name",
                        # The example must itself satisfy the rule stated in
                        # `info` (no spaces), so it uses an underscore.
                        placeholder="e.g., accident_date",
                        info=f"Allowed: [A-Za-z][A-Za-z0-9_-]*, no spaces/accents, max {NAME_MAX_CHARS} chars",
                    )
                    fmt_input = gr.Dropdown(
                        choices=FIELD_FORMATS,
                        value="text",
                        label="Format",
                    )
                desc_input = gr.Textbox(
                    label="Description / Prompt",
                    placeholder=(
                        "E.g., Date when the accident happened. Example: 2021-06-27"
                    ),
                    lines=3,
                    info=f"Max {PROMPT_MAX_CHARS} characters",
                )
                name_live_error = gr.Markdown(visible=False)

                with gr.Row():
                    char_counter = gr.Markdown(f"0/{PROMPT_MAX_CHARS}")
                    add_btn = gr.Button("Add +")
                error_box = gr.Markdown(visible=False)
                with gr.Row():
                    live_count = gr.Markdown(count_message([]))

                gr.Markdown("### 1.2 Format options (shown if needed)")
                with gr.Row():
                    choice_input = gr.Textbox(
                        label="Add a choice",
                        placeholder="e.g., yes",
                        visible=False,
                        info="Same rule as field name: [A-Za-z][A-Za-z0-9_-]*",
                    )
                    add_choice_btn = gr.Button("Add a choice", visible=False)
                    unit_input = gr.Textbox(
                        label="Unit(s)",
                        placeholder="e.g., €, km, %",
                        visible=False,
                    )
                choices_live_error = gr.Markdown(visible=False)
                choices_error = gr.Markdown(visible=False)

                choices_state = gr.State([])
                choices_list = gr.Dataframe(
                    headers=["Choices"],
                    value=[],
                    interactive=False,
                    visible=False,
                    label="Available choices",
                )

                gr.Markdown("### 1.3 Model fields (preview)")
                fields_state = gr.State([])
                table = gr.Dataframe(
                    headers=["Field name", "Format", "Description", "Details"],
                    value=[],
                    interactive=False,
                    label="Model fields",
                )

                gr.Markdown("### 1.4 Manage fields")
                with gr.Row():
                    delete_dropdown = gr.Dropdown(
                        label="Delete a field",
                        choices=[],
                        value=None,
                        visible=False,
                    )
                    delete_btn = gr.Button("Delete", variant="stop", visible=False)

                gr.Markdown("### 1.5 Export / Import a model")
                gr.Markdown(
                    "- Export: generates a reusable .json file.\n"
                    "- Import: loads an existing .json and fills the table above.")
                with gr.Row():
                    download_btn = gr.Button("Download model", visible=False)
                    model_filename = gr.Textbox(label="Filename", placeholder="e.g., claim_form.json", scale=2, visible=False)
                file_out = gr.File(label="Model file", visible=False)
                upload_in = gr.File(label="Upload a model (.json)")
                ready_msg = gr.Markdown(visible=False)

                fmt_input.change(
                    fn=toggle_conditionals,
                    inputs=[fmt_input],
                    outputs=[choice_input, add_choice_btn, choices_list, unit_input, choices_error],
                )
                desc_input.input(
                    fn=update_char_counter,
                    inputs=[desc_input],
                    outputs=[char_counter],
                )
                name_input.input(
                    fn=live_validate_field_name,
                    inputs=[name_input],
                    outputs=[name_live_error, add_btn],
                )
                add_choice_btn.click(
                    fn=add_choice,
                    inputs=[choice_input, choices_state],
                    outputs=[choices_error, choices_state, choices_list, choice_input],
                )
                choice_input.input(
                    fn=live_validate_choice,
                    inputs=[choice_input],
                    outputs=[choices_live_error, add_choice_btn],
                )

                # Every follow-up below reads something add_field writes
                # (fields_state or error_box). They are chained with .then()
                # so they run after add_field has finished; separate click
                # listeners would not wait for it and would act on the
                # values from before the add.
                add_evt = add_btn.click(
                    fn=add_field,
                    inputs=[name_input, fmt_input, desc_input, choices_state, unit_input, fields_state],
                    outputs=[error_box, fields_state, table, delete_dropdown, download_btn, ready_msg],
                )
                add_evt.then(
                    fn=lambda f: visibility_updates_from_fields(f),
                    inputs=[fields_state],
                    outputs=[delete_dropdown, download_btn, delete_btn, model_filename],
                )
                # After an add attempt, reset the temporary choices only if
                # no error is shown (error_box is empty).
                add_evt.then(
                    fn=clear_choices_after_add,
                    inputs=[error_box, choices_state, choices_list, choice_input],
                    outputs=[choices_state, choices_list, choice_input, choices_error],
                )
                # Live field counter.
                add_evt.then(lambda f: count_message(f), inputs=[fields_state], outputs=[live_count])

                # Pydantic callbacks branch added after components are created below
                delete_evt = delete_btn.click(
                    fn=delete_field,
                    inputs=[delete_dropdown, fields_state],
                    outputs=[error_box, fields_state, table, delete_dropdown, download_btn, ready_msg],
                )
                delete_evt.then(
                    lambda f: visibility_updates_from_fields(f),
                    inputs=[fields_state],
                    outputs=[delete_dropdown, download_btn, delete_btn, model_filename],
                )
                # Refresh the counter once the deletion has been applied.
                delete_evt.then(lambda f: count_message(f), inputs=[fields_state], outputs=[live_count])

                download_btn.click(
                    fn=export_model,
                    inputs=[fields_state, model_filename],
                    outputs=[file_out],
                )
                import_evt = upload_in.change(
                    fn=import_model,
                    inputs=[upload_in],
                    outputs=[error_box, fields_state, table, delete_dropdown, download_btn, ready_msg],
                )
                # The counter is refreshed only here, after the import has
                # replaced fields_state.
                import_evt.then(lambda f: count_message(f), inputs=[fields_state], outputs=[live_count])
                import_evt.then(lambda f: visibility_updates_from_fields(f), inputs=[fields_state], outputs=[delete_dropdown, download_btn, delete_btn, model_filename])
                import_evt.then(lambda f: gr.update(visible=len(f or []) > 0), inputs=[fields_state], outputs=[delete_btn])

            with gr.TabItem("Extract"):
                gr.Markdown("## Step 2 — Extract fields from the document")
                gr.Markdown(
                    "Follow the order: 2.1 Auth, 2.2 Model, 2.3 Image, 2.4 Extract.\n"
                    "Model priority: (A) .json uploaded in Extract, (B) .json uploaded in ‘Model’, (C) model built manually.")
                gr.Markdown("### 2.1 Authentication (OPENROUTER_API_KEY)")
                with gr.Row():
                    hf_token_input = gr.Textbox(label="OPENROUTER_API_KEY", type="password", placeholder="OpenRouter API key")
                gr.Markdown("### 2.2 Choose the model to use")
                gr.Markdown(
                    "- Option A: upload a .json here (priority).\n"
                    "- Option B: use the file imported in the ‘Model’ tab.\n"
                    "- Option C: use the model you built manually (table).")
                with gr.Row():
                    model_file_input = gr.File(label="Model file (.json) — Extract (optional)")
                gr.Markdown("### 2.3 Provide the document and run extraction")
                with gr.Row():
                    img_input = gr.File(label="Document (image/PDF upload)", file_count="single", file_types=["image", ".pdf"], type="filepath")
                    image_url_input = gr.Textbox(label="Or image URL (images only)", placeholder="https://example.com/file.png")
                extract_btn = gr.Button("Extract", variant="primary")
                gr.Markdown("### 2.4 Result")
                with gr.Row():
                    extraction_output = gr.Code(label="Result (stream)", language="json")
                validation_msg = gr.Markdown(visible=False)

                # Run the extraction; the input order covers three cases:
                # 1) model uploaded in the Extract tab (takes priority)
                # 2) model uploaded in the Model tab
                # 3) model built manually (fields_state)
                extract_btn.click(
                    fn=run_extraction,
                    inputs=[model_file_input, upload_in, fields_state, img_input, image_url_input, hf_token_input],
                    outputs=[extraction_output, validation_msg],
                    concurrency_limit=2,
                    api_name="extract",
                )

                # Keep the model file in sync between the two tabs.
                # NOTE(review): the two components mirror each other; confirm
                # this cannot ping-pong change events between them.
                # When a file is loaded in Extract, mirror it to the Model tab.
                model_file_input.change(lambda f: f, inputs=[model_file_input], outputs=[upload_in])
                # When a file is loaded in Model, mirror it to the Extract tab.
                import_evt.then(lambda f: f, inputs=[upload_in], outputs=[model_file_input])

    # Enable the request queue (without the deprecated parameter).
    demo.queue()
    return demo
1129
+
1130
+
1131
def main():
    """Build the Gradio interface and launch it."""
    build_ui().launch()


# Only launch the app when this file is executed as a script.
if __name__ == "__main__":
    main()