kebson committed on
Commit
d932601
·
verified ·
1 Parent(s): d539c06

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -21
app.py CHANGED
@@ -1,14 +1,25 @@
1
  import gradio as gr
2
  import numpy as np
3
  from paddleocr import PaddleOCR
4
- from sklearn.cluster import KMeans
5
  import re
6
 
7
  ocr = PaddleOCR(use_textline_orientation=True, lang="fr")
8
 
9
  # -------------------------------------------------
10
- # OUTILS TEXTE
11
  # -------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
12
  def is_continuation(text):
13
  t = text.lower().strip()
14
  return (
@@ -16,17 +27,19 @@ def is_continuation(text):
16
  or t.startswith("avec ")
17
  or t.startswith("y compris")
18
  or t.startswith("compr")
 
 
19
  )
20
 
21
  def has_too_many_digits(text):
22
  return sum(c.isdigit() for c in text) > len(text) * 0.4
23
 
24
  def looks_like_designation(text):
25
- if len(text) < 10:
26
  return False
27
  if has_too_many_digits(text):
28
  return False
29
- if re.match(r"^(m2|m3|ml|u|ff)\b", text.lower()):
30
  return False
31
  return True
32
 
@@ -38,35 +51,45 @@ def extract_designations(image):
38
  return "Aucune image fournie."
39
 
40
  img = np.array(image)
41
- result = ocr.predict(img)
42
 
43
- data = result[0]
44
- texts = data["rec_texts"]
45
- boxes = data["dt_polys"]
46
 
47
  lines = []
48
  for text, box in zip(texts, boxes):
49
- text = text.strip()
50
  y = np.mean([p[1] for p in box])
51
- lines.append((y, text))
52
 
53
- # Tri vertical
54
  lines.sort(key=lambda x: x[0])
55
 
56
- # Suppression en-tête
57
- filtered = []
58
- for y, text in lines:
59
- if text.upper().strip() == "DESIGNATIONS":
 
 
 
 
 
60
  continue
61
- filtered.append(text)
 
 
 
 
 
 
 
62
 
63
  # -------------------------------------------------
64
- # FUSION INTELLIGENTE
65
  # -------------------------------------------------
66
  cells = []
67
  current = ""
68
 
69
- for text in filtered:
70
  if not looks_like_designation(text):
71
  continue
72
 
@@ -76,7 +99,7 @@ def extract_designations(image):
76
 
77
  if is_continuation(text):
78
  current += " " + text
79
- elif text[0].isupper() and len(text) > 20:
80
  cells.append(current.strip())
81
  current = text
82
  else:
@@ -85,6 +108,7 @@ def extract_designations(image):
85
  if current:
86
  cells.append(current.strip())
87
 
 
88
  cells = cells[:9]
89
 
90
  if not cells:
@@ -99,8 +123,8 @@ demo = gr.Interface(
99
  fn=extract_designations,
100
  inputs=gr.Image(type="pil", label="Image du tableau"),
101
  outputs=gr.Textbox(label="Colonne DESIGNATIONS"),
102
- title="Extraction fiable de la colonne DESIGNATIONS",
103
- description="Approche textuelle robuste pour devis et bordereaux"
104
  )
105
 
106
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
2
  import numpy as np
3
  from paddleocr import PaddleOCR
 
4
  import re
5
 
6
  ocr = PaddleOCR(use_textline_orientation=True, lang="fr")
7
 
8
  # -------------------------------------------------
9
+ # FILTRES MÉTIER
10
  # -------------------------------------------------
11
# Keywords that mark a quote header or totals row rather than a designation.
_TITLE_KEYWORDS = (
    "CADRE DE DEVIS",
    "LOT",
    "AXE",
    "PRIX TOTAL",
    "TVA",
    "TTC",
)

# A keyword only counts when it is not embedded in a longer word: a plain
# substring test would flag "hublot", "plots" or "taxe" as titles and drop
# genuine designation lines. `[^\W\d_]` means "any Unicode letter", so the
# look-arounds reject neighbouring letters (accented ones included) while
# still accepting digits and punctuation, e.g. "LOT1" or "LOT-2".
_TITLE_PATTERN = re.compile(
    r"(?<![^\W\d_])(?:"
    + "|".join(re.escape(keyword) for keyword in _TITLE_KEYWORDS)
    + r")(?![^\W\d_])"
)


def is_title(text):
    """Return True when *text* looks like a title/totals line of the quote.

    The text is upper-cased and searched for any of the known header
    keywords ("CADRE DE DEVIS", "LOT", "AXE", "PRIX TOTAL", "TVA", "TTC").
    Keywords must appear as standalone words; occurrences inside a longer
    word do not count.

    Args:
        text: One recognised text line.

    Returns:
        bool: True if a title keyword is present, False otherwise
        (including for an empty string).
    """
    return _TITLE_PATTERN.search(text.upper()) is not None
22
+
23
  def is_continuation(text):
24
  t = text.lower().strip()
25
  return (
 
27
  or t.startswith("avec ")
28
  or t.startswith("y compris")
29
  or t.startswith("compr")
30
+ or t.startswith("pour ")
31
+ or t.startswith("épaisseur")
32
  )
33
 
34
  def has_too_many_digits(text):
35
  return sum(c.isdigit() for c in text) > len(text) * 0.4
36
 
37
  def looks_like_designation(text):
38
+ if len(text) < 8:
39
  return False
40
  if has_too_many_digits(text):
41
  return False
42
+ if re.match(r"^(m2|m3|ml|ff|u)\b", text.lower()):
43
  return False
44
  return True
45
 
 
51
  return "Aucune image fournie."
52
 
53
  img = np.array(image)
54
+ result = ocr.predict(img)[0]
55
 
56
+ texts = result["rec_texts"]
57
+ boxes = result["dt_polys"]
 
58
 
59
  lines = []
60
  for text, box in zip(texts, boxes):
 
61
  y = np.mean([p[1] for p in box])
62
+ lines.append((y, text.strip()))
63
 
64
+ # tri vertical
65
  lines.sort(key=lambda x: x[0])
66
 
67
+ # -------------------------------------------------
68
+ # ON COMMENCE APRÈS "DESIGNATIONS"
69
+ # -------------------------------------------------
70
+ started = False
71
+ cleaned = []
72
+
73
+ for _, text in lines:
74
+ if text.upper() == "DESIGNATIONS":
75
+ started = True
76
  continue
77
+
78
+ if not started:
79
+ continue
80
+
81
+ if is_title(text):
82
+ continue
83
+
84
+ cleaned.append(text)
85
 
86
  # -------------------------------------------------
87
+ # RECONSTRUCTION DES CELLULES
88
  # -------------------------------------------------
89
  cells = []
90
  current = ""
91
 
92
+ for text in cleaned:
93
  if not looks_like_designation(text):
94
  continue
95
 
 
99
 
100
  if is_continuation(text):
101
  current += " " + text
102
+ elif text[0].isupper():
103
  cells.append(current.strip())
104
  current = text
105
  else:
 
108
  if current:
109
  cells.append(current.strip())
110
 
111
+ # Limite à 9 lignes (LOT 1)
112
  cells = cells[:9]
113
 
114
  if not cells:
 
123
  fn=extract_designations,
124
  inputs=gr.Image(type="pil", label="Image du tableau"),
125
  outputs=gr.Textbox(label="Colonne DESIGNATIONS"),
126
+ title="Extraction fiable de la colonne DESIGNATIONS (V3)",
127
+ description="Filtrage métier + reconstruction intelligente des cellules"
128
  )
129
 
130
  demo.launch(server_name="0.0.0.0", server_port=7860)