Spaces:
Sleeping
Sleeping
| import urllib.parse | |
| from flask import Flask, request, send_file, render_template, make_response, jsonify | |
| import stanza | |
| from stanza.utils.conll import CoNLL | |
| from conllu import parse_incr | |
| import os | |
| import tempfile | |
| from io import StringIO | |
| app = Flask(__name__) | |
| # Ensure the templates folder is correctly configured | |
| app.template_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'templates') | |
| # Define the model directory and path | |
| model_dir = os.path.join('models') | |
| model_path = os.path.join(model_dir, 'genipapo.pt') | |
| # Initialize the Stanza pipeline once for reuse | |
| nlp = stanza.Pipeline( | |
| lang='pt', | |
| processors='depparse', | |
| depparse_pretagged=True, | |
| depparse_model_path=model_path, | |
| tokenize_pretokenized=True, | |
| use_gpu=False, | |
| download_method=None | |
| ) | |
| def validate_conllu_file(content): | |
| """ | |
| Validate the .conllu file format and ensure: | |
| 1. Each token line has 10 columns. | |
| 2. POS tags (UPOS) are present and valid. | |
| Forms or lemmas that are "_" issue warnings but do not prevent processing. | |
| """ | |
| errors = [] | |
| warnings = [] | |
| valid_pos_tags = set([ | |
| # Common universal POS tags | |
| "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", | |
| "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X" | |
| ]) | |
| lines = content.strip().split('\n') | |
| line_iter = iter(enumerate(lines, start=1)) | |
| sentence_num = 0 | |
| try: | |
| for sentence in parse_incr(StringIO(content)): | |
| sentence_num += 1 | |
| for token in sentence: | |
| # Find the line corresponding to the current token | |
| while True: | |
| try: | |
| line_num, line = next(line_iter) | |
| except StopIteration: | |
| raise Exception("Unexpected end of content while parsing tokens.") | |
| line = line.strip() | |
| if line == '' or line.startswith('#'): | |
| continue # Skip empty lines and comments | |
| else: | |
| break # Found the token line | |
| columns = line.split('\t') | |
| if len(columns) != 10: | |
| errors.append(f"Line {line_num} of the conllu file: Incorrect number of columns ({len(columns)} found, 10 required).") | |
| continue # Skip further checks for this token | |
| if isinstance(token['id'], int): # Process only word tokens | |
| token_id = token['id'] | |
| form = token.get('form', '').strip() | |
| lemma = token.get('lemma', '').strip() | |
| upos = token.get('upos', '').strip().upper() | |
| if upos == '_': | |
| errors.append(f"Line {line_num}: Missing POS tag (UPOS).") | |
| elif upos not in valid_pos_tags: | |
| errors.append(f"Error on line {line_num} of the conllu file: Invalid POS tag '{upos}'.") | |
| if form == "_": | |
| warnings.append(f"Warning on line {line_num} of the conllu file: Form is empty") | |
| if lemma == "_": | |
| warnings.append(f"Warning on line {line_num} of the conllu file: Lemma is empty") | |
| except Exception as e: | |
| errors.append(f"Parsing error: {str(e)}") | |
| return False, errors, warnings | |
| if errors: | |
| return False, errors, warnings | |
| else: | |
| return True, [], warnings | |
| # Main route for file upload | |
| def upload_file(): | |
| if request.method == 'POST': | |
| # Check if the file is present in the request | |
| if 'file' not in request.files: | |
| return 'No file found in the request.', 400 | |
| file = request.files['file'] | |
| if file.filename == '': | |
| return 'No file selected.', 400 | |
| if file and file.filename.endswith('.conllu'): | |
| # Read the file content | |
| content = file.read().decode('utf-8') | |
| # Validate the .conllu file | |
| is_valid, errors, warnings = validate_conllu_file(content) | |
| if not is_valid: | |
| # Return validation errors and stop processing | |
| error_message = "Validation failed:<br>" + "<br>".join(errors) | |
| return error_message, 400, {'Content-Type': 'text/html'} | |
| else: | |
| # Optionally, display warnings to the user | |
| if warnings: | |
| warning_message = "Warnings:<br>" + "<br>".join(warnings) | |
| # You can choose to display warnings or log them | |
| print(warning_message) # Or handle as needed | |
| # Save the valid file and process it | |
| input_temp_path = save_temp_file(content) | |
| # Process the file after validation | |
| try: | |
| output_file_path = process_file(input_temp_path, file.filename) | |
| # Create a response object to include headers | |
| response = make_response(send_file(output_file_path, as_attachment=True)) | |
| # Include warnings in the response headers if any | |
| if warnings: | |
| # Join warnings into a single string | |
| warnings_str = '\n'.join(warnings) | |
| # URL-encode the warnings string to safely include in the header | |
| warnings_encoded = urllib.parse.quote(warnings_str) | |
| # Include warnings in a custom header | |
| response.headers['X-Warnings'] = warnings_encoded | |
| return response | |
| except Exception as e: | |
| # Handle unexpected errors in processing | |
| return f"Error during processing: {str(e)}", 500 | |
| else: | |
| return 'Invalid file type. Only .conllu files are allowed.', 400 | |
| # Render the HTML template | |
| return render_template('upload_conllu.html') | |
| def save_temp_file(content): | |
| """ | |
| Save the content to a temporary file and return its path. | |
| """ | |
| with tempfile.NamedTemporaryFile(delete=False, suffix='.conllu', mode='w', encoding='utf-8') as input_temp: | |
| input_temp.write(content) | |
| return input_temp.name | |
| def process_file(input_file_path, original_filename): | |
| """ | |
| Process the .conllu file using the Stanza pipeline. | |
| """ | |
| doc = CoNLL.conll2doc(input_file=input_file_path) | |
| parsed_doc = nlp(doc) | |
| for orig_sentence, parsed_sentence in zip(doc.sentences, parsed_doc.sentences): | |
| for orig_word, parsed_word in zip(orig_sentence.words, parsed_sentence.words): | |
| orig_word.head = parsed_word.head | |
| orig_word.deprel = parsed_word.deprel | |
| base_name = os.path.splitext(original_filename)[0] | |
| output_filename = base_name + '_parsed.conllu' | |
| output_file_path = os.path.join(tempfile.gettempdir(), output_filename) | |
| with open(output_file_path, 'w', encoding='utf-8') as f: | |
| f.write("{:C}".format(doc)) | |
| f.write('\n\n') | |
| return output_file_path | |
| def process_api(): | |
| response_format = request.args.get('response_format', 'file') | |
| if 'file' not in request.files: | |
| return jsonify({'error': 'No file part in the request'}), 400 | |
| file = request.files['file'] | |
| if file.filename == '': | |
| return jsonify({'error': 'No file selected'}), 400 | |
| if not file.filename.endswith('.conllu'): | |
| return jsonify({'error': 'Invalid file type. Only .conllu files are allowed.'}), 400 | |
| content = file.read().decode('utf-8') | |
| is_valid, errors, warnings = validate_conllu_file(content) | |
| if not is_valid: | |
| return jsonify({'status': 'error', 'errors': errors, 'warnings': warnings}), 400 | |
| # Save the valid file and process it | |
| input_temp_path = save_temp_file(content) | |
| try: | |
| output_file_path = process_file(input_temp_path, file.filename) | |
| if response_format == 'json': | |
| # Read the processed content from the file | |
| with open(output_file_path, 'r', encoding='utf-8') as processed_file: | |
| output_content = processed_file.read() | |
| return jsonify({ | |
| 'status': 'success', | |
| 'warnings': warnings, | |
| 'processed_content': output_content | |
| }), 200 | |
| else: | |
| # Return the processed file directly | |
| response = send_file(output_file_path, as_attachment=True, download_name='processed.conllu') | |
| if warnings: | |
| warnings_str = '\n'.join(warnings) | |
| response.headers['X-Warnings'] = urllib.parse.quote(warnings_str) | |
| return response | |
| except Exception as e: | |
| return jsonify({'status': 'error', 'message': str(e)}), 500 | |
| def about(): | |
| return render_template('about.html') | |
| def process_api_json(): | |
| # Check if the request body contains JSON | |
| if not request.is_json: | |
| return jsonify({'error': 'Request body must be JSON'}), 400 | |
| data = request.get_json() | |
| content = data.get('content') | |
| if not content: | |
| return jsonify({'error': 'JSON must include a "content" field with .conllu data'}), 400 | |
| # Validate the .conllu content | |
| is_valid, errors, warnings = validate_conllu_file(content) | |
| if not is_valid: | |
| return jsonify({'status': 'error', 'errors': errors, 'warnings': warnings}), 400 | |
| try: | |
| # Save the valid content to a temporary file | |
| input_temp_path = save_temp_file(content) | |
| output_file_path = process_file(input_temp_path, "input.conllu") | |
| # Read the processed content from the file | |
| with open(output_file_path, 'r', encoding='utf-8') as processed_file: | |
| output_content = processed_file.read() | |
| return jsonify({ | |
| 'status': 'success', | |
| 'warnings': warnings, | |
| 'processed_content': output_content | |
| }), 200 | |
| except Exception as e: | |
| return jsonify({'status': 'error', 'message': str(e)}), 500 | |
| def contact(): | |
| return render_template('contact.html') | |
| def api_guide(): | |
| return render_template('api_guide.html') | |
| if __name__ == '__main__': | |
| # Run the app on port 8000 | |
| app.run(host='0.0.0.0', port=8000) | |