kebson committed on
Commit
d3dbd12
·
verified ·
1 Parent(s): d932601

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -38
app.py CHANGED
@@ -3,48 +3,51 @@ import numpy as np
3
  from paddleocr import PaddleOCR
4
  import re
5
 
 
 
 
6
  ocr = PaddleOCR(use_textline_orientation=True, lang="fr")
7
 
8
  # -------------------------------------------------
9
- # FILTRES MÉTIER
10
  # -------------------------------------------------
11
  def is_title(text):
12
  t = text.upper()
13
- keywords = [
14
  "CADRE DE DEVIS",
15
  "LOT",
16
  "AXE",
17
  "PRIX TOTAL",
18
  "TVA",
19
  "TTC"
20
- ]
21
- return any(k in t for k in keywords)
 
 
 
 
 
 
 
22
 
23
  def is_continuation(text):
24
  t = text.lower().strip()
25
  return (
26
- t.startswith("et ")
27
- or t.startswith("avec ")
28
  or t.startswith("y compris")
29
- or t.startswith("compr")
 
 
30
  or t.startswith("pour ")
31
- or t.startswith("épaisseur")
32
  )
33
 
34
- def has_too_many_digits(text):
35
- return sum(c.isdigit() for c in text) > len(text) * 0.4
36
-
37
- def looks_like_designation(text):
38
- if len(text) < 8:
39
- return False
40
- if has_too_many_digits(text):
41
- return False
42
- if re.match(r"^(m2|m3|ml|ff|u)\b", text.lower()):
43
- return False
44
- return True
45
 
46
  # -------------------------------------------------
47
- # EXTRACTION
48
  # -------------------------------------------------
49
  def extract_designations(image):
50
  if image is None:
@@ -56,17 +59,17 @@ def extract_designations(image):
56
  texts = result["rec_texts"]
57
  boxes = result["dt_polys"]
58
 
 
59
  lines = []
60
  for text, box in zip(texts, boxes):
61
  y = np.mean([p[1] for p in box])
62
  lines.append((y, text.strip()))
63
 
64
- # tri vertical
65
  lines.sort(key=lambda x: x[0])
66
 
67
- # -------------------------------------------------
68
- # ON COMMENCE APRÈS "DESIGNATIONS"
69
- # -------------------------------------------------
70
  started = False
71
  cleaned = []
72
 
@@ -81,16 +84,26 @@ def extract_designations(image):
81
  if is_title(text):
82
  continue
83
 
84
- cleaned.append(text)
 
85
 
86
- # -------------------------------------------------
87
- # RECONSTRUCTION DES CELLULES
88
- # -------------------------------------------------
89
  cells = []
90
  current = ""
91
 
92
  for text in cleaned:
93
- if not looks_like_designation(text):
 
 
 
 
 
 
 
 
 
94
  continue
95
 
96
  if not current:
@@ -99,16 +112,20 @@ def extract_designations(image):
99
 
100
  if is_continuation(text):
101
  current += " " + text
102
- elif text[0].isupper():
103
- cells.append(current.strip())
104
- current = text
105
  else:
106
- current += " " + text
 
 
 
 
 
107
 
108
  if current:
109
  cells.append(current.strip())
110
 
111
- # Limite à 9 lignes (LOT 1)
 
 
112
  cells = cells[:9]
113
 
114
  if not cells:
@@ -117,14 +134,14 @@ def extract_designations(image):
117
  return "\n".join(f"{i+1}. {c}" for i, c in enumerate(cells))
118
 
119
  # -------------------------------------------------
120
- # INTERFACE
121
  # -------------------------------------------------
122
  demo = gr.Interface(
123
  fn=extract_designations,
124
  inputs=gr.Image(type="pil", label="Image du tableau"),
125
- outputs=gr.Textbox(label="Colonne DESIGNATIONS"),
126
- title="Extraction fiable de la colonne DESIGNATIONS (V3)",
127
- description="Filtrage métier + reconstruction intelligente des cellules"
128
  )
129
 
130
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
3
  from paddleocr import PaddleOCR
4
  import re
5
 
6
# -------------------------------------------------
# OCR
# -------------------------------------------------
# Single PaddleOCR engine built once at import time and shared by the
# extraction code below (configured with lang="fr" and text-line
# orientation handling enabled).
ocr = PaddleOCR(lang="fr", use_textline_orientation=True)
10
 
11
  # -------------------------------------------------
12
+ # RÈGLES MÉTIER
13
  # -------------------------------------------------
14
# Heading / totals keywords, matched only when they are not embedded in a
# longer word: the character on each side must not be a letter.  Digits and
# punctuation are allowed neighbours so that "LOT1" or "TVA:" still match.
# A plain substring test would also fire on "PILOTIS" (contains LOT) or
# "TAXE" (contains AXE) and make the caller drop genuine designation lines.
_TITLE_RE = re.compile(
    r"(?<![^\W\d_])"
    r"(?:CADRE DE DEVIS|LOTS?|AXES?|PRIX TOTAL|TVA|TTC)"
    r"(?![^\W\d_])"
)


def is_title(text):
    """Return True when an OCR line is a heading or totals row.

    The line is upper-cased and searched for one of the known keywords
    (CADRE DE DEVIS, LOT, AXE, PRIX TOTAL, TVA, TTC).  A keyword only counts
    when it stands on its own, i.e. it is not preceded or followed by
    another letter.

    Args:
        text: One recognised text line.

    Returns:
        bool: True if the line contains a standalone title keyword.
    """
    return _TITLE_RE.search(text.upper()) is not None
24
+
25
def is_f_start(text):
    """Tell whether a line opens a new cell: it starts with an uppercase "F".

    Lines starting with "F6" are the one exception and never open a cell.
    The check is case-sensitive and looks at the very first characters, so
    leading whitespace is not skipped here.
    """
    if text.startswith("F6"):
        # "F6" is explicitly excluded from the "capital F" rule.
        return False
    return text.startswith("F")
29
+
30
def is_f6(text):
    """Return True when the line begins with the exact characters "F6"."""
    # Compare the two-character prefix directly (case-sensitive).
    return text[:2] == "F6"
32
 
33
# Lower-case prefixes marking a line as the continuation of the previous
# one.  Entries with a trailing space ("et ", "pour ") only match when the
# word is followed by a space.
_CONTINUATION_PREFIXES = (
    "avec",
    "et ",
    "y compris",
    "compris",
    "basse",
    "franchissable",
    "pour ",
    "f6",
)


def is_continuation(text):
    """Return True when a line continues the current cell.

    The line is lower-cased and stripped, then tested against the known
    continuation prefixes in a single ``str.startswith`` call.

    Args:
        text: One recognised text line.

    Returns:
        bool: True if the normalised line starts with a continuation prefix.
    """
    return text.lower().strip().startswith(_CONTINUATION_PREFIXES)
45
 
46
def looks_like_text(text):
    """Return True for lines worth keeping: at least 4 characters, not digits only."""
    if len(text) < 4:
        return False
    # Reject lines made up solely of digits.
    return re.match(r"^\d+$", text) is None
 
 
 
 
 
 
 
 
 
48
 
49
  # -------------------------------------------------
50
+ # EXTRACTION PRINCIPALE
51
  # -------------------------------------------------
52
  def extract_designations(image):
53
  if image is None:
 
59
  texts = result["rec_texts"]
60
  boxes = result["dt_polys"]
61
 
62
+ # Tri vertical
63
  lines = []
64
  for text, box in zip(texts, boxes):
65
  y = np.mean([p[1] for p in box])
66
  lines.append((y, text.strip()))
67
 
 
68
  lines.sort(key=lambda x: x[0])
69
 
70
+ # -----------------------------
71
+ # APRÈS "DESIGNATIONS"
72
+ # -----------------------------
73
  started = False
74
  cleaned = []
75
 
 
84
  if is_title(text):
85
  continue
86
 
87
+ if looks_like_text(text):
88
+ cleaned.append(text)
89
 
90
+ # -----------------------------
91
+ # CONSTRUCTION DES CELLULES
92
+ # -----------------------------
93
  cells = []
94
  current = ""
95
 
96
  for text in cleaned:
97
+ # F MAJUSCULE (≠ F6) → NOUVELLE CELLULE
98
+ if is_f_start(text):
99
+ if current:
100
+ cells.append(current.strip())
101
+ current = text
102
+ continue
103
+
104
+ # F6 → CONTINUATION FORCÉE
105
+ if is_f6(text):
106
+ current += " " + text
107
  continue
108
 
109
  if not current:
 
112
 
113
  if is_continuation(text):
114
  current += " " + text
 
 
 
115
  else:
116
+ # Nouvelle cellule logique (changement fort)
117
+ if text[0].isupper() and len(current) > 25:
118
+ cells.append(current.strip())
119
+ current = text
120
+ else:
121
+ current += " " + text
122
 
123
  if current:
124
  cells.append(current.strip())
125
 
126
+ # -----------------------------
127
+ # SORTIE
128
+ # -----------------------------
129
  cells = cells[:9]
130
 
131
  if not cells:
 
134
  return "\n".join(f"{i+1}. {c}" for i, c in enumerate(cells))
135
 
136
  # -------------------------------------------------
137
+ # INTERFACE GRADIO
138
  # -------------------------------------------------
139
# Gradio UI: one image input, one text output, wired to
# extract_designations.
demo = gr.Interface(
    title="Extraction fiable de la colonne DESIGNATIONS",
    description="Règle F majuscule respectée F6 = continuation (cellule 7)",
    fn=extract_designations,
    inputs=gr.Image(label="Image du tableau", type="pil"),
    outputs=gr.Textbox(label="Colonne DESIGNATIONS (V7)"),
)

# Serve on all interfaces, port 7860.
demo.launch(server_port=7860, server_name="0.0.0.0")