Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,10 +4,8 @@ import os
|
|
| 4 |
import base64
|
| 5 |
from io import BytesIO
|
| 6 |
import pandas as pd
|
| 7 |
-
|
| 8 |
-
from bs4 import BeautifulSoup
|
| 9 |
import tempfile
|
| 10 |
-
import re # Ajouté pour extraire le tableau Markdown de la réponse complète
|
| 11 |
|
| 12 |
# Initialize Groq client
|
| 13 |
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
|
|
@@ -15,49 +13,45 @@ client = Groq(api_key=os.getenv("GROQ_API_KEY"))
|
|
| 15 |
def image_to_base64(image):
    """Encode a PIL image as a base64 JPEG string for the Groq API.

    Args:
        image: A PIL-style image object exposing ``mode``, ``convert`` and
            ``save``.

    Returns:
        The JPEG bytes of the image, base64-encoded and decoded to a UTF-8
        ``str``.
    """
    # JPEG has no alpha channel or palette: Pillow raises OSError when asked
    # to save e.g. RGBA / LA / P images (typical for PNG uploads). Convert
    # those to RGB first; ``convert`` returns a new image, so the caller's
    # object is left untouched.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if getattr(image, "mode", "RGB") not in jpeg_modes:
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 20 |
|
| 21 |
-
def
|
| 22 |
-
"""
|
| 23 |
-
#
|
| 24 |
-
table_pattern = r'(\|.*?\n(?:\|.*?\n)+)'
|
| 25 |
match = re.search(table_pattern, text, re.DOTALL)
|
| 26 |
-
if match:
|
| 27 |
-
return
|
| 28 |
-
return None
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
# Extraire d'abord le tableau pur
|
| 33 |
-
table_md = extract_markdown_table(text)
|
| 34 |
-
if not table_md:
|
| 35 |
-
return None
|
| 36 |
|
| 37 |
-
#
|
| 38 |
-
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
| 61 |
|
| 62 |
def process_image_and_get_response(image):
|
| 63 |
"""Process the uploaded image, send to Groq vision model, parse response to table, and generate Excel."""
|
|
@@ -67,12 +61,18 @@ def process_image_and_get_response(image):
|
|
| 67 |
# Convert image to base64
|
| 68 |
base64_image = image_to_base64(image)
|
| 69 |
|
| 70 |
-
#
|
| 71 |
-
prompt =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
try:
|
| 74 |
completion = client.chat.completions.create(
|
| 75 |
-
model="meta-llama/llama-4-scout-17b-16e-instruct",
|
| 76 |
messages=[
|
| 77 |
{
|
| 78 |
"role": "user",
|
|
@@ -85,30 +85,29 @@ def process_image_and_get_response(image):
|
|
| 85 |
]
|
| 86 |
}
|
| 87 |
],
|
| 88 |
-
temperature=0.5,
|
| 89 |
-
max_completion_tokens=2048,
|
| 90 |
top_p=1,
|
| 91 |
-
stream=False,
|
| 92 |
stop=None
|
| 93 |
)
|
| 94 |
|
| 95 |
-
response = completion.choices[0].message.content
|
| 96 |
|
| 97 |
-
# Parse la réponse en DataFrame
|
| 98 |
df = parse_markdown_table_to_df(response)
|
| 99 |
excel_file = None
|
| 100 |
-
if
|
| 101 |
# Crée un fichier Excel temporaire
|
| 102 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
|
| 103 |
with pd.ExcelWriter(tmp.name, engine='openpyxl') as writer:
|
| 104 |
df.to_excel(writer, sheet_name='Tableau_Extrait', index=False)
|
| 105 |
excel_file = tmp.name
|
| 106 |
else:
|
| 107 |
-
#
|
| 108 |
-
df_fallback = pd.DataFrame({"Réponse brute": [response]})
|
| 109 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
|
| 110 |
with pd.ExcelWriter(tmp.name, engine='openpyxl') as writer:
|
| 111 |
-
|
| 112 |
excel_file = tmp.name
|
| 113 |
|
| 114 |
return response, excel_file
|
|
|
|
| 4 |
import base64
|
| 5 |
from io import BytesIO
|
| 6 |
import pandas as pd
|
| 7 |
+
import re
|
|
|
|
| 8 |
import tempfile
|
|
|
|
| 9 |
|
| 10 |
# Initialize Groq client
|
| 11 |
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
|
|
|
|
| 13 |
def image_to_base64(image):
    """Encode a PIL image as a base64 JPEG string for the Groq API.

    Args:
        image: A PIL-style image object exposing ``mode``, ``convert`` and
            ``save``.

    Returns:
        The JPEG bytes of the image, base64-encoded and decoded to a UTF-8
        ``str``.
    """
    # JPEG has no alpha channel or palette: Pillow raises OSError when asked
    # to save e.g. RGBA / LA / P images (typical for PNG uploads). Convert
    # those to RGB first; ``convert`` returns a new image, so the caller's
    # object is left untouched.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if getattr(image, "mode", "RGB") not in jpeg_modes:
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
| 18 |
|
| 19 |
+
# A line made only of pipes, dashes, colons and spaces: a |---|:--:| separator
# row (or a visually empty row). Applied with fullmatch so that a data row
# whose first cell is empty, e.g. "|  | 4 |", is NOT mistaken for a separator.
_SEPARATOR_LINE = re.compile(r'\|[-| :]*')


def _split_markdown_row(line):
    """Split one Markdown table line into a list of stripped cell strings."""
    body = line.strip()
    # Remove the border pipes individually so that a row missing its
    # trailing pipe does not lose its last cell.
    if body.startswith('|'):
        body = body[1:]
    if body.endswith('|'):
        body = body[:-1]
    return [cell.strip() for cell in body.split('|')]


def parse_markdown_table_to_df(text):
    """Parse the first Markdown table found in ``text`` into a DataFrame.

    The table is the first run of at least two lines starting with ``|``
    (blank lines inside the run are ignored). The run ends at the first
    non-blank line that does not start with ``|``, so surrounding prose or
    code fences are not pulled into the table. The last row does not need a
    trailing newline.

    Args:
        text: Raw model response, possibly containing text around the table.

    Returns:
        A DataFrame of string cells whose columns are the table headers.
        Rows shorter than the header are padded with ``''`` and longer rows
        are truncated. On failure, a one-row DataFrame with a single
        ``"Erreur"`` column describing the problem is returned instead.
    """
    if not text:
        return pd.DataFrame({"Erreur": ["Aucun tableau Markdown trouvé dans la réponse"]})

    # Collect the first run of >= 2 pipe-led lines, working line by line
    # instead of with a newline-terminated regex (which dropped the final
    # row whenever the response had been stripped).
    table_lines = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        if stripped.startswith('|'):
            table_lines.append(stripped)
        elif len(table_lines) >= 2:
            break
        else:
            # A lone pipe-led line followed by prose is not a table.
            table_lines = []

    if len(table_lines) < 2:
        return pd.DataFrame({"Erreur": ["Aucun tableau Markdown trouvé dans la réponse"]})

    # Drop separator rows (|---|---|) and rows with no content at all.
    content_lines = [
        line for line in table_lines if not _SEPARATOR_LINE.fullmatch(line)
    ]
    if not content_lines:
        return pd.DataFrame({"Erreur": ["Tableau vide ou mal formé"]})

    headers = _split_markdown_row(content_lines[0])
    width = len(headers)

    # Align every data row on the header width: pad short rows with empty
    # strings, truncate long ones.
    rows = []
    for line in content_lines[1:]:
        cells = _split_markdown_row(line)
        cells.extend([''] * (width - len(cells)))
        rows.append(cells[:width])

    df = pd.DataFrame(rows, columns=headers)
    if df.empty:
        return pd.DataFrame({"Erreur": ["Aucune donnée valide extraite"]})
    return df
|
| 55 |
|
| 56 |
def process_image_and_get_response(image):
|
| 57 |
"""Process the uploaded image, send to Groq vision model, parse response to table, and generate Excel."""
|
|
|
|
| 61 |
# Convert image to base64
|
| 62 |
base64_image = image_to_base64(image)
|
| 63 |
|
| 64 |
+
# Prompt optimisé pour un Markdown propre
|
| 65 |
+
prompt = (
|
| 66 |
+
"Extrait le tableau en entier de cette image et recopie-le à l'identique ici au format Markdown. "
|
| 67 |
+
"Utilise des | pour les colonnes et une ligne |---|---| pour les séparateurs. "
|
| 68 |
+
"N'inclus aucun texte avant ou après le tableau. "
|
| 69 |
+
"Assure-toi que chaque ligne a exactement le même nombre de colonnes que les en-têtes, "
|
| 70 |
+
"en remplissant les cellules vides avec '' si nécessaire."
|
| 71 |
+
)
|
| 72 |
|
| 73 |
try:
|
| 74 |
completion = client.chat.completions.create(
|
| 75 |
+
model="meta-llama/llama-4-scout-17b-16e-instruct",
|
| 76 |
messages=[
|
| 77 |
{
|
| 78 |
"role": "user",
|
|
|
|
| 85 |
]
|
| 86 |
}
|
| 87 |
],
|
| 88 |
+
temperature=0.5,
|
| 89 |
+
max_completion_tokens=2048,
|
| 90 |
top_p=1,
|
| 91 |
+
stream=False,
|
| 92 |
stop=None
|
| 93 |
)
|
| 94 |
|
| 95 |
+
response = completion.choices[0].message.content.strip()
|
| 96 |
|
| 97 |
+
# Parse la réponse en DataFrame
|
| 98 |
df = parse_markdown_table_to_df(response)
|
| 99 |
excel_file = None
|
| 100 |
+
if not df.empty and "Erreur" not in df.columns:
|
| 101 |
# Crée un fichier Excel temporaire
|
| 102 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
|
| 103 |
with pd.ExcelWriter(tmp.name, engine='openpyxl') as writer:
|
| 104 |
df.to_excel(writer, sheet_name='Tableau_Extrait', index=False)
|
| 105 |
excel_file = tmp.name
|
| 106 |
else:
|
| 107 |
+
# Si parsing échoue, crée un Excel avec un message d'erreur
|
|
|
|
| 108 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
|
| 109 |
with pd.ExcelWriter(tmp.name, engine='openpyxl') as writer:
|
| 110 |
+
df.to_excel(writer, sheet_name='Erreur', index=False)
|
| 111 |
excel_file = tmp.name
|
| 112 |
|
| 113 |
return response, excel_file
|