Spaces:
Running
Running
| """File processing utilities for NER annotation.""" | |
| import os | |
| import json | |
| import pandas as pd | |
| from typing import List, Dict, Union, Optional | |
| from .text_processing import tokenize_text, process_text_for_gliner | |
def process_uploaded_file(file_obj) -> List[str]:
    """Process an uploaded file into a list of sentences.

    CSV uploads are read with pandas and sentences are taken from the
    'Nội dung' (content) column; any other file is treated as UTF-8 text
    with one sentence per non-empty line.  Every raw sentence is then
    passed through ``process_text_for_gliner`` and the per-sentence
    results are flattened into one list.

    Args:
        file_obj: The uploaded file object.  Must expose ``.name``;
            non-CSV files must also support ``.read()`` returning bytes.

    Returns:
        List of processed sentences.

    Raises:
        ValueError: If no file was uploaded.
        Exception: If reading or processing the file fails.
    """
    if file_obj is None:
        raise ValueError("Please upload a file first!")
    try:
        if file_obj.name.endswith('.csv'):
            # CSV upload: sentences live in the 'Nội dung' column.
            df = pd.read_csv(file_obj.name)
            sentences = df['Nội dung'].dropna().tolist()
        else:
            # Plain-text upload: one sentence per non-empty line.
            content = file_obj.read().decode('utf-8')
            sentences = [line.strip() for line in content.splitlines() if line.strip()]
        # Process each sentence and flatten the per-sentence results.
        processed_sentences = []
        for sentence in sentences:
            processed_sentences.extend(process_text_for_gliner(sentence))
        return processed_sentences
    except Exception as e:
        # Chain the original exception so the root cause (pandas error,
        # decode error, missing column, ...) stays visible in tracebacks.
        raise Exception(f"Error reading file: {str(e)}") from e
def load_from_local_file(
    file_path: str,
    file_format: str = "json"
) -> List[Dict]:
    """Load and convert data from a local file in various formats.

    Args:
        file_path: Path to the file to load
        file_format: Format of the file (json, conll, or txt)

    Returns:
        List of converted examples in the standard
        ``{"tokenized_text", "ner", "validated"}`` format.

    Raises:
        ValueError: If the format is unsupported or the JSON payload is
            not a list of examples.
        Exception: If reading/parsing the file fails.
    """
    try:
        if file_format == "json":
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, list):
                # Already in the standard format -> return unchanged.
                if all("tokenized_text" in item and "ner" in item for item in data):
                    return data
                # Otherwise convert from other JSON layouts.
                return _convert_json_format(data)
            raise ValueError("JSON file must contain a list of examples")
        elif file_format == "conll":
            return _load_conll_file(file_path)
        elif file_format == "txt":
            return _load_txt_file(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_format}")
    except ValueError:
        # Deliberate validation errors: propagate with their original
        # type and message instead of re-wrapping them generically.
        raise
    except Exception as e:
        # Chain so the root cause (IO error, JSON decode error, ...)
        # is preserved in the traceback.
        raise Exception(f"Error loading file: {str(e)}") from e
| def _convert_json_format(data: List[Dict]) -> List[Dict]: | |
| """Convert JSON data from various formats to the standard format. | |
| Args: | |
| data: List of examples in various JSON formats | |
| Returns: | |
| List of examples in the standard format | |
| """ | |
| converted_data = [] | |
| for item in data: | |
| if "tokens" in item and "ner_tags" in item: | |
| ner_spans = [] | |
| current_span = None | |
| for i, (token, tag) in enumerate(zip(item["tokens"], item["ner_tags"])): | |
| if tag != "O": | |
| if current_span is None: | |
| current_span = [i, i, tag] | |
| elif tag == current_span[2]: | |
| current_span[1] = i | |
| else: | |
| ner_spans.append(current_span) | |
| current_span = [i, i, tag] | |
| elif current_span is not None: | |
| ner_spans.append(current_span) | |
| current_span = None | |
| if current_span is not None: | |
| ner_spans.append(current_span) | |
| converted_data.append({ | |
| "tokenized_text": item["tokens"], | |
| "ner": ner_spans, | |
| "validated": False | |
| }) | |
| return converted_data | |
| def _load_conll_file(file_path: str) -> List[Dict]: | |
| """Load and convert data from a CoNLL format file. | |
| Args: | |
| file_path: Path to the CoNLL file | |
| Returns: | |
| List of converted examples | |
| """ | |
| converted_data = [] | |
| current_example = {"tokens": [], "ner_tags": []} | |
| with open(file_path, 'r', encoding='utf-8') as f: | |
| for line in f: | |
| line = line.strip() | |
| if line: | |
| if line.startswith("#"): | |
| continue | |
| parts = line.split() | |
| if len(parts) >= 2: | |
| token, tag = parts[0], parts[-1] | |
| current_example["tokens"].append(token) | |
| current_example["ner_tags"].append(tag) | |
| elif current_example["tokens"]: | |
| # Convert current example | |
| ner_spans = [] | |
| current_span = None | |
| for i, (token, tag) in enumerate(zip(current_example["tokens"], current_example["ner_tags"])): | |
| if tag != "O": | |
| if current_span is None: | |
| current_span = [i, i, tag] | |
| elif tag == current_span[2]: | |
| current_span[1] = i | |
| else: | |
| ner_spans.append(current_span) | |
| current_span = [i, i, tag] | |
| elif current_span is not None: | |
| ner_spans.append(current_span) | |
| current_span = None | |
| if current_span is not None: | |
| ner_spans.append(current_span) | |
| converted_data.append({ | |
| "tokenized_text": current_example["tokens"], | |
| "ner": ner_spans, | |
| "validated": False | |
| }) | |
| current_example = {"tokens": [], "ner_tags": []} | |
| # Handle last example if exists | |
| if current_example["tokens"]: | |
| ner_spans = [] | |
| current_span = None | |
| for i, (token, tag) in enumerate(zip(current_example["tokens"], current_example["ner_tags"])): | |
| if tag != "O": | |
| if current_span is None: | |
| current_span = [i, i, tag] | |
| elif tag == current_span[2]: | |
| current_span[1] = i | |
| else: | |
| ner_spans.append(current_span) | |
| current_span = [i, i, tag] | |
| elif current_span is not None: | |
| ner_spans.append(current_span) | |
| current_span = None | |
| if current_span is not None: | |
| ner_spans.append(current_span) | |
| converted_data.append({ | |
| "tokenized_text": current_example["tokens"], | |
| "ner": ner_spans, | |
| "validated": False | |
| }) | |
| return converted_data | |
def _load_txt_file(file_path: str) -> List[Dict]:
    """Load and convert data from a plain text file.

    Each non-empty line is tokenized with ``tokenize_text`` and becomes
    one unannotated example (empty ``ner`` list, ``validated`` False).

    Args:
        file_path: Path to the text file

    Returns:
        List of converted examples
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped_lines = (raw_line.strip() for raw_line in f)
        return [
            {
                "tokenized_text": tokenize_text(text),
                "ner": [],
                "validated": False,
            }
            for text in stripped_lines
            if text
        ]