kebson committed on
Commit
71b8f4e
·
verified ·
1 Parent(s): d4e0cc5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -24
app.py CHANGED
@@ -1,13 +1,12 @@
1
  import gradio as gr
2
  from transformers import TrOCRProcessor, VisionEncoderDecoderModel
3
  from PIL import Image
4
- import re
5
  import torch
6
 
7
  # ===============================
8
  # Charger le modèle TrOCR public
9
  # ===============================
10
- model_name = "microsoft/trocr-base-handwritten" # modèle public
11
  processor = TrOCRProcessor.from_pretrained(model_name)
12
  model = VisionEncoderDecoderModel.from_pretrained(model_name)
13
 
@@ -26,31 +25,21 @@ def extract_description(image_pil):
26
  # Séparer le texte en lignes
27
  lines = [line.strip() for line in ocr_text.split("\n") if line.strip()]
28
 
29
- # Identifier la colonne Description
30
- desc_col = []
31
- header_found = False
32
- headers = []
33
 
34
- if lines:
35
- first_line = lines[0]
36
- # Split en colonnes par tabulation ou espaces multiples
37
- headers = re.split(r"\t+|\s{2,}", first_line)
38
- try:
39
- desc_index = next(i for i, h in enumerate(headers) if "description" in h.lower())
40
- header_found = True
41
- except StopIteration:
42
- desc_index = None
43
 
44
- # Extraire les valeurs sous la colonne Description
45
- if header_found:
46
- for line in lines[1:]:
47
- cols = re.split(r"\t+|\s{2,}", line)
48
- if desc_index is not None and desc_index < len(cols):
49
- desc_col.append(cols[desc_index])
50
- else:
51
  return "❌ Colonne 'Description' non trouvée", ocr_text
52
-
53
- return "\n".join(desc_col), ocr_text
54
 
55
  # ===============================
56
  # Interface Gradio
 
1
  import gradio as gr
2
  from transformers import TrOCRProcessor, VisionEncoderDecoderModel
3
  from PIL import Image
 
4
  import torch
5
 
6
  # ===============================
7
  # Charger le modèle TrOCR public
8
  # ===============================
9
+ model_name = "microsoft/trocr-base-handwritten" # modèle OCR général
10
  processor = TrOCRProcessor.from_pretrained(model_name)
11
  model = VisionEncoderDecoderModel.from_pretrained(model_name)
12
 
 
25
  # Séparer le texte en lignes
26
  lines = [line.strip() for line in ocr_text.split("\n") if line.strip()]
27
 
28
+ # Détection de la colonne Description via mot-clé
29
+ desc_lines = []
30
+ found_header = False
 
31
 
32
+ for line in lines:
33
+ if found_header:
34
+ # toutes les lignes après le header sont considérées comme contenu de la colonne
35
+ desc_lines.append(line)
36
+ elif "description" in line.lower():
37
+ found_header = True
 
 
 
38
 
39
+ if not desc_lines:
 
 
 
 
 
 
40
  return "❌ Colonne 'Description' non trouvée", ocr_text
41
+ else:
42
+ return "\n".join(desc_lines), ocr_text
43
 
44
  # ===============================
45
  # Interface Gradio