kebson committed on
Commit
ce6a96f
·
verified ·
1 Parent(s): fe12926

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -64
app.py CHANGED
@@ -3,112 +3,118 @@ import numpy as np
3
  from paddleocr import PaddleOCR
4
  from sklearn.cluster import KMeans
5
 
 
 
 
# Module-level OCR engine, built once at import time and reused by every
# request. Options are passed through to PaddleOCR unchanged.
ocr = PaddleOCR(use_textline_orientation=True, lang="fr")
10
 
11
# Exact (upper-cased) text of the header cell that precedes the column content.
HEADER_EXACT = "DESIGNATIONS"


def _collect_elements(texts, boxes):
    """Return ``(x_center, y_center, text)`` for each OCR fragment of 2+ chars."""
    elements = []
    for text, box in zip(texts, boxes):
        text = text.strip()
        if len(text) < 2:
            continue
        x = np.mean([p[0] for p in box])
        y = np.mean([p[1] for p in box])
        elements.append((x, y, text))
    return elements


def _non_numeric_score(column):
    """Return the total length of the texts in *column* that contain no digit."""
    return sum(len(t) for _, _, t in column if not any(c.isdigit() for c in t))


def _adaptive_y_threshold(ys):
    """Return the vertical merge distance derived from the gaps between *ys*."""
    if len(ys) > 1:
        # 1.2x the median gap between consecutive centres, never below 22.
        return max(22, np.median(np.diff(sorted(ys))) * 1.2)
    return 30


def _merge_close_rows(content, y_threshold):
    """Join consecutive fragments whose y-centres differ by at most *y_threshold*."""
    lines = []
    current = ""
    last_y = None
    for _, y, text in content:
        if last_y is None or abs(y - last_y) > y_threshold:
            if current:
                lines.append(current.strip())
            current = text
        else:
            current += " " + text
        # The reference point follows the last fragment, so a run of closely
        # spaced fragments keeps extending the same line.
        last_y = y
    if current:
        lines.append(current.strip())
    return lines


def _clean_lines(lines, limit=9):
    """Drop short or mostly-numeric lines (never the first) and keep *limit* lines."""
    kept = []
    for i, line in enumerate(lines):
        if i == 0:
            # The first line is always kept, whatever its content.
            kept.append(line)
            continue
        if len(line) < 5:
            continue
        if sum(c.isdigit() for c in line) > len(line) / 2:
            continue
        kept.append(line)
    return kept[:limit]


def extract_column2_9_lines(image):
    """Extract up to nine lines of the description column from a table image.

    The image is run through the module-level ``ocr`` engine, the recognised
    fragments are clustered into columns by x-centre with KMeans, and the
    column holding the most digit-free text is taken as the description
    column. Content starts after the ``HEADER_EXACT`` cell when one is found,
    otherwise at the top of the column.

    Args:
        image: Image convertible with ``np.array`` (the Gradio input uses
            ``type="pil"``), or ``None``.

    Returns:
        A string of numbered lines (``"1. ..."``), or a French status message
        when no image, no text, too little text or no usable line is available.
    """
    if image is None:
        return "Aucune image fournie."

    result = ocr.predict(np.array(image))
    if not result:
        return "Aucun texte détecté."

    data = result[0]
    elements = _collect_elements(
        data.get("rec_texts", []),
        data.get("dt_polys", []),
    )
    if len(elements) < 5:
        return "Pas assez de données OCR."

    # --- Cluster fragments into columns by x-centre ---
    xs = np.array([[e[0]] for e in elements])
    kmeans = KMeans(
        n_clusters=min(7, len(elements) // 6 + 2),
        random_state=42,
        n_init=10,
    )
    labels = kmeans.fit_predict(xs)

    columns = {}
    for element, label in zip(elements, labels):
        columns.setdefault(label, []).append(element)

    # --- Description column = the one with the most non-numeric text ---
    desc_col = sorted(
        max(columns.values(), key=_non_numeric_score),
        key=lambda e: e[1],  # top -> bottom
    )

    # --- Skip everything up to and including the header cell, if present ---
    start_index = 0
    for i, (_, _, text) in enumerate(desc_col):
        if text.upper() == HEADER_EXACT:
            start_index = i + 1
            break
    content = desc_col[start_index:]

    # --- Merge fragments of the same cell, then clean up ---
    y_threshold = _adaptive_y_threshold([y for _, y, _ in content])
    final = _clean_lines(_merge_close_rows(content, y_threshold))

    if not final:
        # Previously an empty string was returned here, which left the output
        # box blank with no explanation.
        return "Aucune ligne trouvée dans la colonne."

    return "\n".join(f"{i}. {line}" for i, line in enumerate(final, start=1))
105
 
106
# --- Gradio UI ---
# One image in, one text box out; the components are built first and then
# handed to the Interface.
_image_input = gr.Image(type="pil", label="Image du tableau")
_text_output = gr.Textbox(label="Colonne DESIGNATIONS")

demo = gr.Interface(
    fn=extract_column2_9_lines,
    inputs=_image_input,
    outputs=_text_output,
    title="Extraction fiable de la colonne DESIGNATIONS",
)

# Started at import time, bound to all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)
 
3
  from paddleocr import PaddleOCR
4
  from sklearn.cluster import KMeans
5
 
6
# -----------------------------
# OCR
# -----------------------------
# Module-level OCR engine, built once at import time and reused by every
# request. Options are passed through to PaddleOCR unchanged.
ocr = PaddleOCR(use_textline_orientation=True, lang="fr")
13
 
14
# -----------------------------
# Second-column extraction
# -----------------------------
_MIN_TEXT_LENGTH = 3   # OCR fragments shorter than this are ignored
_MIN_ELEMENTS = 5      # fewer kept fragments than this: give up early
_MAX_COLUMNS = 6       # upper bound on the number of x-clusters
_Y_THRESHOLD = 22      # max y-centre distance for fragments of one cell
_MIN_LINE_LENGTH = 5   # merged lines must be strictly longer than this


def _text_centers(texts, boxes, min_length=_MIN_TEXT_LENGTH):
    """Return ``(x_center, y_center, text)`` for each fragment that is kept."""
    elements = []
    for text, box in zip(texts, boxes):
        text = text.strip()
        if len(text) < min_length:
            continue
        x_center = np.mean([p[0] for p in box])
        y_center = np.mean([p[1] for p in box])
        elements.append((x_center, y_center, text))
    return elements


def _merge_vertically(column, y_threshold=_Y_THRESHOLD):
    """Join top-to-bottom fragments whose y-centres are within *y_threshold*."""
    merged = []
    current = ""
    last_y = None
    for _, y, text in sorted(column, key=lambda e: e[1]):
        if last_y is None or abs(y - last_y) > y_threshold:
            if current:
                merged.append(current.strip())
            current = text
        else:
            current += " " + text
        # Compare against the previous fragment, not the first one of the
        # cell, so multi-line cells keep accumulating.
        last_y = y
    if current:
        merged.append(current.strip())
    return merged


def extract_second_column(image):
    """Extract the text of the second column (left to right) of a table image.

    The image is run through the module-level ``ocr`` engine, the recognised
    fragments are clustered into columns by x-centre with KMeans, the columns
    are ordered by mean x, and the fragments of the second one are merged
    vertically into lines.

    Args:
        image: Image convertible with ``np.array`` (the Gradio input uses
            ``type="pil"``), or ``None``.

    Returns:
        A string of numbered lines (``"1. ..."``), or a French status message
        when no image, no text, too little text, fewer than two columns or no
        usable line is available.
    """
    if image is None:
        return "Aucune image fournie."

    result = ocr.predict(np.array(image))
    if not result:
        return "OCR exécuté mais aucun texte détecté."

    data = result[0]
    # .get() instead of indexing: a result without these keys yields the
    # status message below rather than a KeyError.
    texts = data.get("rec_texts", [])
    # NOTE(review): assumes dt_polys is aligned one-to-one with rec_texts;
    # confirm against the PaddleOCR result format.
    boxes = data.get("dt_polys", [])
    if len(texts) == 0:
        return "OCR exécuté mais aucun texte détecté."

    elements = _text_centers(texts, boxes)
    if len(elements) < _MIN_ELEMENTS:
        return "Pas assez de texte détecté."

    # 1. Group fragments into columns by x-centre. The column count is
    #    estimated from the number of fragments, between 2 and _MAX_COLUMNS.
    xs = np.array([[e[0]] for e in elements])
    n_cols = min(_MAX_COLUMNS, max(2, len(elements) // 6))
    labels = KMeans(n_clusters=n_cols, random_state=42, n_init=10).fit_predict(xs)

    columns = {}
    for element, label in zip(elements, labels):
        columns.setdefault(label, []).append(element)

    # Order the columns left to right by their mean x-centre.
    sorted_columns = sorted(
        columns.values(),
        key=lambda col: np.mean([e[0] for e in col]),
    )
    if len(sorted_columns) < 2:
        return "Impossible de détecter la 2e colonne."

    # 2-3. Take the second column and merge its fragments into cells.
    merged = _merge_vertically(sorted_columns[1])

    # 4. Light cleanup: drop very short lines.
    final = [line for line in merged if len(line) > _MIN_LINE_LENGTH]
    if not final:
        return "Colonne détectée mais vide."

    return "\n".join(f"{i}. {line}" for i, line in enumerate(final, start=1))
108
 
109
# -----------------------------
# Gradio interface
# -----------------------------
# One image in, one text box out; the components are built first and then
# handed to the Interface.
_image_input = gr.Image(type="pil", label="Image du tableau")
_text_output = gr.Textbox(label="Contenu de la 2e colonne")

demo = gr.Interface(
    fn=extract_second_column,
    inputs=_image_input,
    outputs=_text_output,
    title="Extraction de la 2e colonne du tableau",
    description="La colonne cible est toujours la deuxième (texte uniquement)",
)

# Started at import time, bound to all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)