Spaces: Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,13 +1,12 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
|
| 3 |
from PIL import Image
|
| 4 |
-
import re
|
| 5 |
import torch
|
| 6 |
|
| 7 |
# ===============================
|
| 8 |
# Charger le modèle TrOCR public
|
| 9 |
# ===============================
|
| 10 |
-
model_name = "microsoft/trocr-base-handwritten" # modèle
|
| 11 |
processor = TrOCRProcessor.from_pretrained(model_name)
|
| 12 |
model = VisionEncoderDecoderModel.from_pretrained(model_name)
|
| 13 |
|
|
@@ -26,31 +25,21 @@ def extract_description(image_pil):
|
|
| 26 |
# Séparer le texte en lignes
|
| 27 |
lines = [line.strip() for line in ocr_text.split("\n") if line.strip()]
|
| 28 |
|
| 29 |
-
#
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
headers = []
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
header_found = True
|
| 41 |
-
except StopIteration:
|
| 42 |
-
desc_index = None
|
| 43 |
|
| 44 |
-
|
| 45 |
-
if header_found:
|
| 46 |
-
for line in lines[1:]:
|
| 47 |
-
cols = re.split(r"\t+|\s{2,}", line)
|
| 48 |
-
if desc_index is not None and desc_index < len(cols):
|
| 49 |
-
desc_col.append(cols[desc_index])
|
| 50 |
-
else:
|
| 51 |
return "❌ Colonne 'Description' non trouvée", ocr_text
|
| 52 |
-
|
| 53 |
-
|
| 54 |
|
| 55 |
# ===============================
|
| 56 |
# Interface Gradio
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
|
| 3 |
from PIL import Image
|
|
|
|
| 4 |
import torch
|
| 5 |
|
| 6 |
# ===============================
|
| 7 |
# Charger le modèle TrOCR public
|
| 8 |
# ===============================
|
| 9 |
+
model_name = "microsoft/trocr-base-handwritten" # modèle OCR général
|
| 10 |
processor = TrOCRProcessor.from_pretrained(model_name)
|
| 11 |
model = VisionEncoderDecoderModel.from_pretrained(model_name)
|
| 12 |
|
|
|
|
| 25 |
# Séparer le texte en lignes
|
| 26 |
lines = [line.strip() for line in ocr_text.split("\n") if line.strip()]
|
| 27 |
|
| 28 |
+
# Détection de la colonne Description via mot-clé
|
| 29 |
+
desc_lines = []
|
| 30 |
+
found_header = False
|
|
|
|
| 31 |
|
| 32 |
+
for line in lines:
|
| 33 |
+
if found_header:
|
| 34 |
+
# toutes les lignes après le header sont considérées comme contenu de la colonne
|
| 35 |
+
desc_lines.append(line)
|
| 36 |
+
elif "description" in line.lower():
|
| 37 |
+
found_header = True
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
+
if not desc_lines:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
return "❌ Colonne 'Description' non trouvée", ocr_text
|
| 41 |
+
else:
|
| 42 |
+
return "\n".join(desc_lines), ocr_text
|
| 43 |
|
| 44 |
# ===============================
|
| 45 |
# Interface Gradio
|