kebson committed on
Commit
e175021
·
verified ·
1 Parent(s): ae7976c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -52
app.py CHANGED
@@ -2,18 +2,18 @@ import gradio as gr
2
  import numpy as np
3
  import unicodedata
4
  from paddleocr import PaddleOCR
5
- from sklearn.cluster import KMeans
6
 
7
  # -------------------------------------------------
8
- # OCR
9
  # -------------------------------------------------
10
  ocr = PaddleOCR(
11
  lang="fr",
12
- use_textline_orientation=True
 
13
  )
14
 
15
  # -------------------------------------------------
16
- # Normalisation texte (casse + accents)
17
  # -------------------------------------------------
18
  def normalize(text: str) -> str:
19
  text = text.lower()
@@ -22,7 +22,7 @@ def normalize(text: str) -> str:
22
  return " ".join(text.split())
23
 
24
  # -------------------------------------------------
25
- # Titres valides de la colonne 2
26
  # -------------------------------------------------
27
  COL_TITLES = {
28
  "designation",
@@ -32,7 +32,7 @@ COL_TITLES = {
32
  }
33
 
34
  # -------------------------------------------------
35
- # Mots / lignes à ignorer
36
  # -------------------------------------------------
37
  IGNORE_KEYWORDS = {
38
  "prix", "total", "ht", "htva", "tva",
@@ -40,72 +40,58 @@ IGNORE_KEYWORDS = {
40
  }
41
 
42
  # -------------------------------------------------
43
- # Fonction principale
44
  # -------------------------------------------------
45
  def extract_second_column(image):
46
  if image is None:
47
  return "Aucune image fournie."
48
 
49
  img = np.array(image)
50
- result = ocr.predict(img)
51
 
52
- if not result:
53
  return "OCR : aucun texte détecté."
54
 
55
- data = result[0]
56
- texts = data.get("rec_texts", [])
57
- boxes = data.get("dt_polys", [])
58
-
59
  blocks = []
60
- for text, box in zip(texts, boxes):
61
- t = text.strip()
62
- if len(t) < 2:
 
 
63
  continue
64
 
65
  x = np.mean([p[0] for p in box])
66
  y = np.mean([p[1] for p in box])
67
-
68
- blocks.append((t, x, y))
69
-
70
- if len(blocks) < 5:
71
- return "Pas assez de texte exploitable."
72
 
73
  # -------------------------------------------------
74
- # 1. Détection du X de la colonne cible via son titre
75
  # -------------------------------------------------
76
- col_x = None
77
  for text, x, y in blocks:
78
  if normalize(text) in COL_TITLES:
79
- col_x = x
80
  break
81
 
82
  if col_x is None:
83
- return "Titre de la colonne cible non détecté."
84
 
85
  # -------------------------------------------------
86
- # 2. Sélection des blocs proches du X détecté
87
  # -------------------------------------------------
88
- X_THRESHOLD = 45
89
  column_blocks = [
90
  (t, x, y) for t, x, y in blocks
91
- if abs(x - col_x) < X_THRESHOLD
92
  ]
93
 
94
- if not column_blocks:
95
- return "Colonne détectée mais vide."
96
-
97
- # -------------------------------------------------
98
- # 3. Tri vertical (haut → bas)
99
- # -------------------------------------------------
100
  column_blocks.sort(key=lambda e: e[2])
101
 
102
  # -------------------------------------------------
103
- # 4. Fusion intelligente des lignes OCR
104
  # -------------------------------------------------
105
  merged = []
106
  current = ""
107
  last_y = None
108
- Y_THRESHOLD = 22
109
 
110
  for text, x, y in column_blocks:
111
  nt = normalize(text)
@@ -113,7 +99,13 @@ def extract_second_column(image):
113
  if any(k in nt for k in IGNORE_KEYWORDS):
114
  continue
115
 
116
- if last_y is None or abs(y - last_y) > Y_THRESHOLD:
 
 
 
 
 
 
117
  if current:
118
  merged.append(current.strip())
119
  current = text
@@ -126,23 +118,19 @@ def extract_second_column(image):
126
  merged.append(current.strip())
127
 
128
  # -------------------------------------------------
129
- # 5. Nettoyage final (cellules texte uniquement)
130
  # -------------------------------------------------
131
  final = []
132
  for line in merged:
133
- nt = normalize(line)
134
- if len(nt) < 4:
135
  continue
136
- if sum(c.isdigit() for c in line) > len(line) / 2:
137
  continue
138
  final.append(line)
139
 
140
  if not final:
141
- return "Aucune cellule texte valide trouvée."
142
 
143
- # -------------------------------------------------
144
- # 6. Résultat numéroté
145
- # -------------------------------------------------
146
  return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
147
 
148
  # -------------------------------------------------
@@ -151,12 +139,12 @@ def extract_second_column(image):
151
  demo = gr.Interface(
152
  fn=extract_second_column,
153
  inputs=gr.Image(type="pil", label="Image du tableau"),
154
- outputs=gr.Textbox(label="Contenu de la colonne 2"),
155
- title="Extraction fiable de la colonne 2 (Désignation / Description)",
156
- description=(
157
- "Extraction robuste de la deuxième colonne des tableaux scannés "
158
- "(Désignation, DESIGNATIONS, Description, Description des services)."
159
- )
160
  )
161
 
162
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
2
  import numpy as np
3
  import unicodedata
4
  from paddleocr import PaddleOCR
 
5
 
6
  # -------------------------------------------------
7
+ # OCR (CONFIG STABLE POUR HUGGING FACE)
8
  # -------------------------------------------------
9
  ocr = PaddleOCR(
10
  lang="fr",
11
+ use_angle_cls=False, # ⛔ désactivation orientation
12
+ show_log=False # silence logs
13
  )
14
 
15
  # -------------------------------------------------
16
+ # Normalisation texte
17
  # -------------------------------------------------
18
  def normalize(text: str) -> str:
19
  text = text.lower()
 
22
  return " ".join(text.split())
23
 
24
  # -------------------------------------------------
25
+ # Titres colonne 2
26
  # -------------------------------------------------
27
  COL_TITLES = {
28
  "designation",
 
32
  }
33
 
34
  # -------------------------------------------------
35
+ # Mots à ignorer
36
  # -------------------------------------------------
37
  IGNORE_KEYWORDS = {
38
  "prix", "total", "ht", "htva", "tva",
 
40
  }
41
 
42
  # -------------------------------------------------
43
+ # Extraction colonne 2
44
  # -------------------------------------------------
45
  def extract_second_column(image):
46
  if image is None:
47
  return "Aucune image fournie."
48
 
49
  img = np.array(image)
50
+ result = ocr.ocr(img, cls=False)
51
 
52
+ if not result or not result[0]:
53
  return "OCR : aucun texte détecté."
54
 
 
 
 
 
55
  blocks = []
56
+ for line in result[0]:
57
+ text = line[1][0].strip()
58
+ box = line[0]
59
+
60
+ if len(text) < 2:
61
  continue
62
 
63
  x = np.mean([p[0] for p in box])
64
  y = np.mean([p[1] for p in box])
65
+ blocks.append((text, x, y))
 
 
 
 
66
 
67
  # -------------------------------------------------
68
+ # 1. Trouver le titre
69
  # -------------------------------------------------
70
+ col_x, title_y = None, None
71
  for text, x, y in blocks:
72
  if normalize(text) in COL_TITLES:
73
+ col_x, title_y = x, y
74
  break
75
 
76
  if col_x is None:
77
+ return "Titre de la colonne non détecté."
78
 
79
  # -------------------------------------------------
80
+ # 2. Filtrage par X + sous le titre
81
  # -------------------------------------------------
 
82
  column_blocks = [
83
  (t, x, y) for t, x, y in blocks
84
+ if abs(x - col_x) < 50 and y > title_y + 15
85
  ]
86
 
 
 
 
 
 
 
87
  column_blocks.sort(key=lambda e: e[2])
88
 
89
  # -------------------------------------------------
90
+ # 3. Fusion contrôlée
91
  # -------------------------------------------------
92
  merged = []
93
  current = ""
94
  last_y = None
 
95
 
96
  for text, x, y in column_blocks:
97
  nt = normalize(text)
 
99
  if any(k in nt for k in IGNORE_KEYWORDS):
100
  continue
101
 
102
+ new_cell = (
103
+ last_y is None
104
+ or abs(y - last_y) > 35
105
+ or text[0].isupper()
106
+ )
107
+
108
+ if new_cell:
109
  if current:
110
  merged.append(current.strip())
111
  current = text
 
118
  merged.append(current.strip())
119
 
120
  # -------------------------------------------------
121
+ # 4. Nettoyage final
122
  # -------------------------------------------------
123
  final = []
124
  for line in merged:
125
+ if not line[0].isupper():
 
126
  continue
127
+ if sum(c.isdigit() for c in line) > len(line) * 0.4:
128
  continue
129
  final.append(line)
130
 
131
  if not final:
132
+ return "Aucune cellule valide trouvée."
133
 
 
 
 
134
  return "\n".join(f"{i+1}. {line}" for i, line in enumerate(final))
135
 
136
  # -------------------------------------------------
 
139
  demo = gr.Interface(
140
  fn=extract_second_column,
141
  inputs=gr.Image(type="pil", label="Image du tableau"),
142
+ outputs=gr.Textbox(label="Contenu colonne 2"),
143
+ title="Extraction colonne Désignation / Description"
 
 
 
 
144
  )
145
 
146
+ demo.launch(
147
+ server_name="0.0.0.0",
148
+ server_port=7860,
149
+ ssr_mode=False
150
+ )