Update app.py
Browse files
app.py
CHANGED
|
@@ -5,6 +5,7 @@ from huggingface_hub import InferenceClient
|
|
| 5 |
import os
|
| 6 |
import time
|
| 7 |
import asyncio
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
# Configure the inference client
|
|
@@ -78,12 +79,101 @@ def get_answer(query, context, model="meta-llama/Llama-3.3-70B-Instruct"):
|
|
| 78 |
print(f"Error in response generation: {e}")
|
| 79 |
return "I'm sorry, an error occurred while generating the response."
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
# Function to modify XML code
|
| 82 |
-
def modify_xml(eaf_file, model="meta-llama/Llama-3.3-70B-Instruct"):
|
| 83 |
try:
|
| 84 |
client = get_inference_client()
|
| 85 |
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
## EAF File Structure Reference (Detailed)
|
| 89 |
|
|
@@ -164,13 +254,11 @@ def modify_xml(eaf_file, model="meta-llama/Llama-3.3-70B-Instruct"):
|
|
| 164 |
- Purpose: Provides standard annotation values for consistent tagging
|
| 165 |
|
| 166 |
## Processing Instructions
|
| 167 |
-
1. Parse the
|
| 168 |
-
2.
|
| 169 |
-
3. Apply the
|
| 170 |
-
4.
|
| 171 |
-
|
| 172 |
-
6. Apply any specific formatting requirements specified by the user
|
| 173 |
-
|
| 174 |
## Output Requirements
|
| 175 |
- Return ONLY the modified EAF content
|
| 176 |
- Maintain proper XML formatting and indentation
|
|
@@ -178,23 +266,38 @@ def modify_xml(eaf_file, model="meta-llama/Llama-3.3-70B-Instruct"):
|
|
| 178 |
- Do not include explanations, commentary, or reasoning in the output
|
| 179 |
- If specific sections should be returned rather than the full document, specify exactly which parts
|
| 180 |
|
| 181 |
-
Provided .eaf file and instructions: {code} <|eot_id|>"""
|
| 182 |
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
messages=[
|
| 186 |
-
{"role": "system", "content": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
|
| 187 |
-
"You are a linguistic annotation and code expert that helps the user in using an annotation software called ELAN."
|
| 188 |
-
"An annotation file (eaf) is the document that contains all the information about tiers (their attributes and dependency relations), annotations, and time alignments and links to media files."
|
| 189 |
-
"Your task is to modify the given eaf file and extract information strictly following the instructions given by the user.<|eot_id|>"},
|
| 190 |
-
{"role": "user", "content": PROMPT.format(code=eaf_file)},
|
| 191 |
-
{"role": "assistant", "content": "Here is your output: "}
|
| 192 |
-
],
|
| 193 |
-
temperature=0.6,
|
| 194 |
-
max_tokens=128000
|
| 195 |
-
)
|
| 196 |
|
| 197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
except Exception as e:
|
| 199 |
print(f"Error in eaf file modification: {e}")
|
| 200 |
return "I'm sorry, an error occurred while modifying the eaf file."
|
|
@@ -235,4 +338,4 @@ demo = gr.ChatInterface(
|
|
| 235 |
if __name__ == "__main__":
|
| 236 |
# Enable built-in Gradio streaming
|
| 237 |
demo.queue()
|
| 238 |
-
demo.launch(share=True)
|
|
|
|
| 5 |
import os
|
| 6 |
import time
|
| 7 |
import asyncio
|
| 8 |
+
import re
|
| 9 |
|
| 10 |
|
| 11 |
# Configure the inference client
|
|
|
|
| 79 |
print(f"Error in response generation: {e}")
|
| 80 |
return "I'm sorry, an error occurred while generating the response."
|
| 81 |
|
| 82 |
+
# Helpers for splitting EAF content into chunks and recombining them
|
| 83 |
+
def split_eaf_content(eaf_content, max_chunk_size=50000):
    """Split EAF XML text into chunks of roughly ``max_chunk_size`` characters.

    The document is cut on ``<TIER>...</TIER>`` boundaries. Every chunk starts
    with the document header (all text before the first TIER) so that it
    carries the same leading context. Chunks other than the last are closed
    with a temporary ``</ANNOTATION_DOCUMENT>`` tag; the last chunk ends with
    the original footer (all text after the last TIER).

    Args:
        eaf_content: Full text of the EAF file.
        max_chunk_size: Target maximum chunk length in characters. A chunk
            can still exceed it when a single TIER (plus the header) is
            larger than the limit, because a TIER is never cut in half.

    Returns:
        A list of chunk strings. Content that already fits is returned as a
        single-element list. If no TIER element is found, the text is cut
        into fixed-size slices (which are not well-formed XML on their own).

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        # A zero step crashes range() and a negative one silently yields no
        # chunks at all, so reject both up front with a clear message.
        raise ValueError("max_chunk_size must be a positive integer")

    if len(eaf_content) <= max_chunk_size:
        return [eaf_content]

    # Find every TIER element (non-greedy, spanning newlines).
    tier_pattern = re.compile(r'<TIER\s+[^>]*>.*?</TIER>', re.DOTALL)
    tier_matches = list(tier_pattern.finditer(eaf_content))

    if not tier_matches:
        # No TIER to cut on: fall back to plain size-based slicing.
        return [
            eaf_content[i:i + max_chunk_size]
            for i in range(0, len(eaf_content), max_chunk_size)
        ]

    # Header: everything before the first TIER. Footer: everything after the last.
    header_end = tier_matches[0].start()
    header = eaf_content[:header_end]
    footer = eaf_content[tier_matches[-1].end():]

    chunks = []
    current_chunk = header
    has_tier = False          # True once current_chunk holds at least one TIER
    previous_end = header_end

    for match in tier_matches:
        # Include the text between the previous TIER and this one so that the
        # original whitespace/indentation between tiers is preserved.
        segment = eaf_content[previous_end:match.end()]

        # Only flush when the current chunk already holds a TIER; otherwise an
        # oversized first TIER would produce a useless header-only chunk.
        if has_tier and len(current_chunk) + len(segment) > max_chunk_size:
            # Temporary closing tag so the chunk is a closed document.
            chunks.append(current_chunk + "</ANNOTATION_DOCUMENT>")
            # Start the next chunk with a fresh copy of the header.
            current_chunk = header + match.group(0)
        else:
            current_chunk += segment

        has_tier = True
        previous_end = match.end()

    # The last chunk gets the real footer (which closes the document).
    chunks.append(current_chunk + footer)

    return chunks
|
| 133 |
+
|
| 134 |
+
def combine_eaf_chunks(processed_chunks):
    """Merge processed EAF chunks back into a single EAF document.

    The first chunk keeps its document start (prolog, root tag and header)
    but loses its closing ``</ANNOTATION_DOCUMENT>``. Every later chunk loses
    its duplicated start: everything up to the first ``<TIER`` that follows
    the root opening tag, or just the prolog and root tag when the chunk has
    no TIER. Middle chunks also lose their closing tag; the last chunk keeps
    it, so the result is closed exactly once.

    Args:
        processed_chunks: Chunk strings in document order.

    Returns:
        The merged document text. An empty input yields ``""`` and a single
        chunk is returned unchanged. Chunks that contain no
        ``<ANNOTATION_DOCUMENT ...>`` opening tag are concatenated as they are.
    """
    if not processed_chunks:
        return ""
    if len(processed_chunks) == 1:
        return processed_chunks[0]

    # Patterns for the document opening (prolog + root tag) and closing.
    doc_start_pattern = re.compile(r'^.*?<ANNOTATION_DOCUMENT[^>]*>', re.DOTALL)
    doc_end_pattern = re.compile(r'</ANNOTATION_DOCUMENT>.*?$', re.DOTALL)
    tier_start_pattern = re.compile(r'<TIER[\s>/]')

    def _strip_document_start(chunk):
        # Drop the repeated document start from a non-first chunk.
        root = doc_start_pattern.match(chunk)
        if root is None:
            return chunk
        # Stripping only the root tag would leave the copied header section
        # in place and duplicate it in the merged file, so cut up to the
        # first TIER when there is one.
        first_tier = tier_start_pattern.search(chunk, root.end())
        if first_tier is None:
            return chunk[root.end():]
        return chunk[first_tier.start():]

    *leading, last_chunk = processed_chunks

    # First chunk: keep the start of the document, remove the closing tag.
    parts = [doc_end_pattern.sub('', leading[0])]

    # Middle chunks: remove both the duplicated start and the closing tag.
    parts.extend(
        doc_end_pattern.sub('', _strip_document_start(chunk))
        for chunk in leading[1:]
    )

    # Last chunk: remove the duplicated start, keep the closing tag.
    parts.append(_strip_document_start(last_chunk))

    return "".join(parts)
|
| 166 |
+
|
| 167 |
# Function to modify XML code
|
| 168 |
+
def modify_xml(eaf_file, model="meta-llama/Llama-3.3-70B-Instruct", max_chunk_size=50000):
|
| 169 |
try:
|
| 170 |
client = get_inference_client()
|
| 171 |
|
| 172 |
+
# Split the EAF content into smaller chunks
|
| 173 |
+
chunks = split_eaf_content(eaf_file, max_chunk_size)
|
| 174 |
+
|
| 175 |
+
# Base prompt with the instructions and the EAF file structure
|
| 176 |
+
BASE_PROMPT = """<|start_header_id|>user<|end_header_id|>
|
| 177 |
|
| 178 |
## EAF File Structure Reference (Detailed)
|
| 179 |
|
|
|
|
| 254 |
- Purpose: Provides standard annotation values for consistent tagging
|
| 255 |
|
| 256 |
## Processing Instructions
|
| 257 |
+
1. Parse the XML chunk provided below
|
| 258 |
+
2. This is chunk {current_chunk} of {total_chunks}
|
| 259 |
+
3. Apply only the modifications relevant to this chunk
|
| 260 |
+
4. Return ONLY the modified XML content for this chunk
|
| 261 |
+
|
|
|
|
|
|
|
| 262 |
## Output Requirements
|
| 263 |
- Return ONLY the modified EAF content
|
| 264 |
- Maintain proper XML formatting and indentation
|
|
|
|
| 266 |
- Do not include explanations, commentary, or reasoning in the output
|
| 267 |
- If specific sections should be returned rather than the full document, specify exactly which parts
|
| 268 |
|
| 269 |
+
Provided .eaf file chunk and instructions: {code} <|eot_id|>"""
|
| 270 |
|
| 271 |
+
# Process each chunk and collect the results
|
| 272 |
+
processed_chunks = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
+
for i, chunk in enumerate(chunks):
|
| 275 |
+
chunk_prompt = BASE_PROMPT.format(
|
| 276 |
+
current_chunk=i+1,
|
| 277 |
+
total_chunks=len(chunks),
|
| 278 |
+
code=chunk
|
| 279 |
+
)
|
| 280 |
+
|
| 281 |
+
response = client.chat.completions.create(
|
| 282 |
+
model=model,
|
| 283 |
+
messages=[
|
| 284 |
+
{"role": "system", "content": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
|
| 285 |
+
"You are a linguistic annotation and code expert that helps the user in using an annotation software called ELAN."
|
| 286 |
+
"An annotation file (eaf) is the document that contains all the information about tiers (their attributes and dependency relations), annotations, and time alignments and links to media files."
|
| 287 |
+
"Your task is to modify the given eaf file chunk and extract information strictly following the instructions given by the user.<|eot_id|>"},
|
| 288 |
+
{"role": "user", "content": chunk_prompt},
|
| 289 |
+
{"role": "assistant", "content": "Here is your output: "}
|
| 290 |
+
],
|
| 291 |
+
temperature=0.6,
|
| 292 |
+
max_tokens=1024 # Ridotto per stare nei limiti
|
| 293 |
+
)
|
| 294 |
+
|
| 295 |
+
processed_chunks.append(response.choices[0].message.content)
|
| 296 |
+
|
| 297 |
+
# Recombine the results
|
| 298 |
+
combined_result = combine_eaf_chunks(processed_chunks)
|
| 299 |
+
|
| 300 |
+
return combined_result
|
| 301 |
except Exception as e:
|
| 302 |
print(f"Error in eaf file modification: {e}")
|
| 303 |
return "I'm sorry, an error occurred while modifying the eaf file."
|
|
|
|
| 338 |
if __name__ == "__main__":
|
| 339 |
# Enable built-in Gradio streaming
|
| 340 |
demo.queue()
|
| 341 |
+
demo.launch(share=True)
|