HipFil98 committed on
Commit
2e2f40a
·
verified ·
1 Parent(s): 0b2deaa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -25
app.py CHANGED
@@ -5,6 +5,7 @@ from huggingface_hub import InferenceClient
5
  import os
6
  import time
7
  import asyncio
 
8
 
9
 
10
  # Configure the inference client
@@ -78,12 +79,101 @@ def get_answer(query, context, model="meta-llama/Llama-3.3-70B-Instruct"):
78
  print(f"Error in response generation: {e}")
79
  return "I'm sorry, an error occurred while generating the response."
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  # Function to modify XML code
82
- def modify_xml(eaf_file, model="meta-llama/Llama-3.3-70B-Instruct"):
83
  try:
84
  client = get_inference_client()
85
 
86
- PROMPT = """<|start_header_id|>user<|end_header_id|>
 
 
 
 
87
 
88
  ## EAF File Structure Reference (Detailed)
89
 
@@ -164,13 +254,11 @@ def modify_xml(eaf_file, model="meta-llama/Llama-3.3-70B-Instruct"):
164
  - Purpose: Provides standard annotation values for consistent tagging
165
 
166
  ## Processing Instructions
167
- 1. Parse the full XML structure of the provided EAF file
168
- 2. Identify all relevant elements and attributes according to the modification requirements
169
- 3. Apply the specified modifications with precision
170
- 4. Verify that XML structure integrity is maintained
171
- 5. Format the output according to XML standards
172
- 6. Apply any specific formatting requirements specified by the user
173
-
174
  ## Output Requirements
175
  - Return ONLY the modified EAF content
176
  - Maintain proper XML formatting and indentation
@@ -178,23 +266,38 @@ def modify_xml(eaf_file, model="meta-llama/Llama-3.3-70B-Instruct"):
178
  - Do not include explanations, commentary, or reasoning in the output
179
  - If specific sections should be returned rather than the full document, specify exactly which parts
180
 
181
- Provided .eaf file and instructions: {code} <|eot_id|>"""
182
 
183
- response = client.chat.completions.create(
184
- model=model,
185
- messages=[
186
- {"role": "system", "content": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
187
- "You are a linguistic annotation and code expert that helps the user in using an annotation software called ELAN."
188
- "An annotation file (eaf) is the document that contains all the information about tiers (their attributes and dependency relations), annotations, and time alignments and links to media files."
189
- "Your task is to modify the given eaf file and extract information strictly following the instructions given by the user.<|eot_id|>"},
190
- {"role": "user", "content": PROMPT.format(code=eaf_file)},
191
- {"role": "assistant", "content": "Here is your output: "}
192
- ],
193
- temperature=0.6,
194
- max_tokens=128000
195
- )
196
 
197
- return response.choices[0].message.content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  except Exception as e:
199
  print(f"Error in eaf file modification: {e}")
200
  return "I'm sorry, an error occurred while modifying the eaf file."
@@ -235,4 +338,4 @@ demo = gr.ChatInterface(
235
  if __name__ == "__main__":
236
  # Enable built-in Gradio streaming
237
  demo.queue()
238
- demo.launch(share=True)
 
5
  import os
6
  import time
7
  import asyncio
8
+ import re
9
 
10
 
11
  # Configure the inference client
 
79
  print(f"Error in response generation: {e}")
80
  return "I'm sorry, an error occurred while generating the response."
81
 
82
+ # Funzioni per suddividere e combinare i chunk EAF
83
def split_eaf_content(eaf_content, max_chunk_size=50000):
    """Split EAF (ELAN annotation) XML text into smaller chunks.

    The document is cut on TIER element boundaries. Every chunk starts with a
    copy of the document header (all text before the first TIER) so that it
    carries the same preamble as the full file. Every chunk except the last is
    terminated with a temporary ``</ANNOTATION_DOCUMENT>`` closing tag; the
    last chunk ends with the original footer (all text after the last TIER).

    Args:
        eaf_content: Full text of the EAF file.
        max_chunk_size: Target maximum number of characters per chunk.

    Returns:
        A list of chunk strings. If the content already fits, the list holds
        the content unchanged. If no TIER element is found, the content is
        sliced purely by size. A chunk may exceed ``max_chunk_size`` when the
        header plus a single TIER is larger than the limit, because a TIER is
        never split.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")

    if len(eaf_content) <= max_chunk_size:
        return [eaf_content]

    # Match both self-closed tiers (<TIER .../>) and regular ones; without the
    # first alternative an empty tier would be glued to the following tier.
    tier_pattern = re.compile(
        r'<TIER\b[^>]*?/>|<TIER\b[^>]*>.*?</TIER>', re.DOTALL
    )
    tier_matches = list(tier_pattern.finditer(eaf_content))

    # No TIER elements at all: fall back to plain size-based slicing.
    if not tier_matches:
        return [
            eaf_content[start:start + max_chunk_size]
            for start in range(0, len(eaf_content), max_chunk_size)
        ]

    # Header: everything before the first TIER. Footer: everything after the
    # last TIER.
    header = eaf_content[:tier_matches[0].start()]
    footer = eaf_content[tier_matches[-1].end():]

    chunks = []
    parts = [header]
    size = len(header)
    previous_end = tier_matches[0].start()

    for match in tier_matches:
        # Include any text between the previous TIER and this one so that
        # whitespace/comments between tiers are not lost.
        segment = eaf_content[previous_end:match.end()]
        previous_end = match.end()

        # Start a new chunk only if the current one already holds a TIER;
        # otherwise a header-only chunk with no payload would be emitted.
        if len(parts) > 1 and size + len(segment) > max_chunk_size:
            # Temporary closing tag for the non-final chunk.
            parts.append("</ANNOTATION_DOCUMENT>")
            chunks.append("".join(parts))
            parts = [header]
            size = len(header)

        parts.append(segment)
        size += len(segment)

    # The footer belongs to the last chunk only.
    parts.append(footer)
    chunks.append("".join(parts))

    return chunks
133
+
134
def combine_eaf_chunks(processed_chunks):
    """Recombine processed chunks into a single EAF document.

    The first chunk keeps its document preamble, the last chunk keeps its
    footer and closing tag, and the duplicated preamble / temporary closing
    tags of the other chunks are removed.

    Args:
        processed_chunks: List of chunk strings, in document order.

    Returns:
        The combined text. An empty list yields an empty string and a single
        chunk is returned unchanged.
    """
    if not processed_chunks:
        return ""
    if len(processed_chunks) == 1:
        return processed_chunks[0]

    # Patterns locating the opening and the closing of the document.
    doc_start_pattern = re.compile(r'^.*?<ANNOTATION_DOCUMENT[^>]*>', re.DOTALL)
    doc_end_pattern = re.compile(r'</ANNOTATION_DOCUMENT>.*?$', re.DOTALL)
    first_tier_pattern = re.compile(r'<TIER\b')

    def strip_duplicate_preamble(chunk):
        """Drop the repeated document preamble from a non-first chunk."""
        opening = doc_start_pattern.match(chunk)
        if opening is None:
            # No document opening tag: nothing was duplicated in this chunk.
            return chunk
        # Chunks carry a copy of everything that precedes the first TIER
        # (e.g. HEADER, TIME_ORDER), not just the opening tag, so cut up to
        # the first TIER when there is one to avoid duplicating that content.
        tier = first_tier_pattern.search(chunk, opening.end())
        cut = tier.start() if tier else opening.end()
        return chunk[cut:]

    last_index = len(processed_chunks) - 1
    parts = []
    for index, chunk in enumerate(processed_chunks):
        if index > 0:
            chunk = strip_duplicate_preamble(chunk)
        if index < last_index:
            # Remove the closing tag (and anything after it) from every
            # chunk except the last one.
            chunk = doc_end_pattern.sub('', chunk)
        parts.append(chunk)

    return "".join(parts)
166
+
167
  # Function to modify XML code
168
+ def modify_xml(eaf_file, model="meta-llama/Llama-3.3-70B-Instruct", max_chunk_size=50000):
169
  try:
170
  client = get_inference_client()
171
 
172
+ # Dividi il contenuto EAF in chunk più piccoli
173
+ chunks = split_eaf_content(eaf_file, max_chunk_size)
174
+
175
+ # Base prompt con istruzioni e struttura del file EAF
176
+ BASE_PROMPT = """<|start_header_id|>user<|end_header_id|>
177
 
178
  ## EAF File Structure Reference (Detailed)
179
 
 
254
  - Purpose: Provides standard annotation values for consistent tagging
255
 
256
  ## Processing Instructions
257
+ 1. Parse the XML chunk provided below
258
+ 2. This is chunk {current_chunk} of {total_chunks}
259
+ 3. Apply only the modifications relevant to this chunk
260
+ 4. Return ONLY the modified XML content for this chunk
261
+
 
 
262
  ## Output Requirements
263
  - Return ONLY the modified EAF content
264
  - Maintain proper XML formatting and indentation
 
266
  - Do not include explanations, commentary, or reasoning in the output
267
  - If specific sections should be returned rather than the full document, specify exactly which parts
268
 
269
+ Provided .eaf file chunk and instructions: {code} <|eot_id|>"""
270
 
271
+ # Elabora ogni chunk e raccogli i risultati
272
+ processed_chunks = []
 
 
 
 
 
 
 
 
 
 
 
273
 
274
+ for i, chunk in enumerate(chunks):
275
+ chunk_prompt = BASE_PROMPT.format(
276
+ current_chunk=i+1,
277
+ total_chunks=len(chunks),
278
+ code=chunk
279
+ )
280
+
281
+ response = client.chat.completions.create(
282
+ model=model,
283
+ messages=[
284
+ {"role": "system", "content": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
285
+ "You are a linguistic annotation and code expert that helps the user in using an annotation software called ELAN."
286
+ "An annotation file (eaf) is the document that contains all the information about tiers (their attributes and dependency relations), annotations, and time alignments and links to media files."
287
+ "Your task is to modify the given eaf file chunk and extract information strictly following the instructions given by the user.<|eot_id|>"},
288
+ {"role": "user", "content": chunk_prompt},
289
+ {"role": "assistant", "content": "Here is your output: "}
290
+ ],
291
+ temperature=0.6,
292
+ max_tokens=1024 # Ridotto per stare nei limiti
293
+ )
294
+
295
+ processed_chunks.append(response.choices[0].message.content)
296
+
297
+ # Ricombina i risultati
298
+ combined_result = combine_eaf_chunks(processed_chunks)
299
+
300
+ return combined_result
301
  except Exception as e:
302
  print(f"Error in eaf file modification: {e}")
303
  return "I'm sorry, an error occurred while modifying the eaf file."
 
338
  if __name__ == "__main__":
339
  # Enable built-in Gradio streaming
340
  demo.queue()
341
+ demo.launch(share=True)