Spaces:

HipFil98
/

ELAN_bot

Sleeping

App Files Files Community

HipFil98 committed on May 2, 2025

Commit

2e2f40a

verified ·

1 Parent(s): 0b2deaa

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -25

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ from huggingface_hub import InferenceClient
 import os
 import time
 import asyncio
 # Configure the inference client
@@ -78,12 +79,101 @@ def get_answer(query, context, model="meta-llama/Llama-3.3-70B-Instruct"):
         print(f"Error in response generation: {e}")
         return "I'm sorry, an error occurred while generating the response."
 # Function to modify XML code
-def modify_xml(eaf_file, model="meta-llama/Llama-3.3-70B-Instruct"):
     try:
         client = get_inference_client()
-        PROMPT = """<|start_header_id|>user<|end_header_id|>
                     ## EAF File Structure Reference (Detailed)
@@ -164,13 +254,11 @@ def modify_xml(eaf_file, model="meta-llama/Llama-3.3-70B-Instruct"):
                         - Purpose: Provides standard annotation values for consistent tagging
                     ## Processing Instructions
-                    1. Parse the full XML structure of the provided EAF file
-                    2. Identify all relevant elements and attributes according to the modification requirements
-                    3. Apply the specified modifications with precision
-                    4. Verify that XML structure integrity is maintained
-                    5. Format the output according to XML standards
-                    6. Apply any specific formatting requirements specified by the user
                     ## Output Requirements
                     - Return ONLY the modified EAF content
                     - Maintain proper XML formatting and indentation
@@ -178,23 +266,38 @@ def modify_xml(eaf_file, model="meta-llama/Llama-3.3-70B-Instruct"):
                     - Do not include explanations, commentary, or reasoning in the output
                     - If specific sections should be returned rather than the full document, specify exactly which parts
-                    Provided .eaf file and instructions: {code} <|eot_id|>"""
-        response = client.chat.completions.create(
-            model=model,
-            messages=[
-                {"role": "system", "content": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
-                "You are a linguistic annotation and code expert that helps the user in using an annotation software called ELAN."
-                "An annotation file (eaf) is the document that contains all the information about tiers (their attributes and dependency relations), annotations, and time alignments and links to media files."
-                "Your task is to modify the given eaf file and extract information strictly following the instructions given by the user.<|eot_id|>"},
-                {"role": "user", "content": PROMPT.format(code=eaf_file)},
-                {"role": "assistant", "content": "Here is your output: "}
-            ],
-            temperature=0.6,
-            max_tokens=128000
-        )
-        return response.choices[0].message.content
     except Exception as e:
         print(f"Error in eaf file modification: {e}")
         return "I'm sorry, an error occurred while modifying the eaf file."
@@ -235,4 +338,4 @@ demo = gr.ChatInterface(
 if __name__ == "__main__":
     # Enable built-in Gradio streaming
     demo.queue()
-    demo.launch(share=True)

 import os
 import time
 import asyncio
+import re
 # Configure the inference client
         print(f"Error in response generation: {e}")
         return "I'm sorry, an error occurred while generating the response."
+# Funzioni per suddividere e combinare i chunk EAF
def split_eaf_content(eaf_content, max_chunk_size=50000):
    """Split EAF (XML) text into chunks of roughly ``max_chunk_size`` characters.

    Content that already fits is returned as a single chunk. Otherwise the
    text is cut on ``<TIER ...>...</TIER>`` boundaries: every chunk starts
    with a copy of the header (everything before the first tier), every chunk
    but the last is closed with ``</ANNOTATION_DOCUMENT>``, and the footer
    (everything after the last tier) is attached to the last chunk only.
    When no tier is found the text is sliced purely by size.

    Args:
        eaf_content: Full text of the EAF document.
        max_chunk_size: Target maximum chunk length in characters. A chunk
            can still exceed it when the header plus a single tier is larger
            than the limit, because a tier is never cut in half.

    Returns:
        A list of chunk strings, in document order.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        # A zero step makes range() raise and a negative one yields no
        # slices at all, which would silently drop the whole document.
        raise ValueError("max_chunk_size must be a positive integer")

    if len(eaf_content) <= max_chunk_size:
        return [eaf_content]

    # NOTE: "[^>]*>" also accepts a self-closing "<TIER .../>", in which case
    # the match runs on to the next "</TIER>" and groups both tiers together.
    tier_pattern = re.compile(r'<TIER\s+[^>]*>.*?</TIER>', re.DOTALL)
    tier_matches = list(tier_pattern.finditer(eaf_content))

    if not tier_matches:
        # No tier boundaries to cut on: fall back to fixed-size slices.
        return [
            eaf_content[start:start + max_chunk_size]
            for start in range(0, len(eaf_content), max_chunk_size)
        ]

    header_end = tier_matches[0].start()
    header = eaf_content[:header_end]
    footer = eaf_content[tier_matches[-1].end():]

    chunks = []
    parts = [header]
    size = len(header)
    has_tier = False
    previous_end = header_end
    for match in tier_matches:
        # Take the text between the previous tier and this one together with
        # the tier itself, so nothing between tiers is lost.
        segment = eaf_content[previous_end:match.end()]
        previous_end = match.end()

        # Only flush a chunk that already holds a tier; otherwise an
        # oversized first tier would produce a header-only chunk.
        if has_tier and size + len(segment) > max_chunk_size:
            parts.append("</ANNOTATION_DOCUMENT>")
            chunks.append("".join(parts))
            parts = [header]
            size = len(header)

        parts.append(segment)
        size += len(segment)
        has_tier = True

    # The real document tail goes on the last chunk only.
    parts.append(footer)
    chunks.append("".join(parts))
    return chunks
def combine_eaf_chunks(processed_chunks):
    """Merge processed EAF chunks back into one document string.

    The first chunk keeps its document opening and header, the last chunk
    keeps its footer and closing tag, and everything duplicated in between is
    removed:

    * every chunk except the last is cut at its first
      ``</ANNOTATION_DOCUMENT>``;
    * every chunk except the first loses its leading text up to and including
      the ``<ANNOTATION_DOCUMENT ...>`` opening tag and, when a tier follows,
      the repeated header elements that precede the first ``<TIER``.

    A chunk that does not begin with a document opening is left untouched at
    its start.

    Args:
        processed_chunks: Chunk strings in document order.

    Returns:
        The merged document text; an empty string for an empty list, and the
        chunk itself when there is only one.
    """
    if not processed_chunks:
        return ""
    if len(processed_chunks) == 1:
        return processed_chunks[0]

    closing_tag = "</ANNOTATION_DOCUMENT>"
    doc_start_pattern = re.compile(r'^.*?<ANNOTATION_DOCUMENT[^>]*>', re.DOTALL)
    tier_start_pattern = re.compile(r'<TIER[\s>]')

    def drop_leading(chunk):
        # Remove the repeated document opening and header of a later chunk.
        opening = doc_start_pattern.match(chunk)
        if opening is None:
            return chunk
        body = chunk[opening.end():]
        first_tier = tier_start_pattern.search(body)
        # Without a tier there is no safe cut point past the opening tag.
        return body[first_tier.start():] if first_tier else body

    def drop_trailing(chunk):
        # Remove the closing tag and anything that follows it.
        return chunk.partition(closing_tag)[0]

    last_index = len(processed_chunks) - 1
    pieces = []
    for index, chunk in enumerate(processed_chunks):
        if index > 0:
            chunk = drop_leading(chunk)
        if index < last_index:
            chunk = drop_trailing(chunk)
        pieces.append(chunk)
    return "".join(pieces)
 # Function to modify XML code
+def modify_xml(eaf_file, model="meta-llama/Llama-3.3-70B-Instruct", max_chunk_size=50000):
     try:
         client = get_inference_client()
+        # Dividi il contenuto EAF in chunk più piccoli
+        chunks = split_eaf_content(eaf_file, max_chunk_size)
+        # Base prompt con istruzioni e struttura del file EAF
+        BASE_PROMPT = """<|start_header_id|>user<|end_header_id|>
                     ## EAF File Structure Reference (Detailed)
                         - Purpose: Provides standard annotation values for consistent tagging
                     ## Processing Instructions
+                    1. Parse the XML chunk provided below
+                    2. This is chunk {current_chunk} of {total_chunks}
+                    3. Apply only the modifications relevant to this chunk
+                    4. Return ONLY the modified XML content for this chunk
                     ## Output Requirements
                     - Return ONLY the modified EAF content
                     - Maintain proper XML formatting and indentation
                     - Do not include explanations, commentary, or reasoning in the output
                     - If specific sections should be returned rather than the full document, specify exactly which parts
+                    Provided .eaf file chunk and instructions: {code} <|eot_id|>"""
+        # Elabora ogni chunk e raccogli i risultati
+        processed_chunks = []
+        for i, chunk in enumerate(chunks):
+            chunk_prompt = BASE_PROMPT.format(
+                current_chunk=i+1,
+                total_chunks=len(chunks),
+                code=chunk
+            )
+            response = client.chat.completions.create(
+                model=model,
+                messages=[
+                    {"role": "system", "content": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
+                    "You are a linguistic annotation and code expert that helps the user in using an annotation software called ELAN."
+                    "An annotation file (eaf) is the document that contains all the information about tiers (their attributes and dependency relations), annotations, and time alignments and links to media files."
+                    "Your task is to modify the given eaf file chunk and extract information strictly following the instructions given by the user.<|eot_id|>"},
+                    {"role": "user", "content": chunk_prompt},
+                    {"role": "assistant", "content": "Here is your output: "}
+                ],
+                temperature=0.6,
+                max_tokens=1024  # Ridotto per stare nei limiti
+            )
+            processed_chunks.append(response.choices[0].message.content)
+        # Ricombina i risultati
+        combined_result = combine_eaf_chunks(processed_chunks)
+        return combined_result
     except Exception as e:
         print(f"Error in eaf file modification: {e}")
         return "I'm sorry, an error occurred while modifying the eaf file."
 if __name__ == "__main__":
     # Enable built-in Gradio streaming
     demo.queue()
+    demo.launch(share=True)