Bryan Khelven committed on
Commit
ffdedc7
·
1 Parent(s): 3e3bf83

Initial deploy

Browse files
.gitattributes CHANGED
@@ -1,35 +1,13 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
 
 
 
 
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
# Treat everything as text by default (auto-normalize line endings).
# This catch-all must come FIRST: when several patterns match a path, a later
# line overrides an earlier one per attribute, so listing it last would replace
# the `-text` setting of every LFS-managed binary type below with `text=auto`.
* text=auto

# LFS-managed binary types
*.jpg filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use a PyTorch base image with torch already installed
FROM pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime

# Install Stanza.
# The requirement is quoted: RUN uses a shell, and an unquoted `stanza>=1.2`
# is parsed as `pip install stanza` with output redirected to a file "=1.2",
# silently dropping the version constraint.
RUN pip install "stanza>=1.2"

# Define the working directory
WORKDIR /app

# Copy dependency files and install
COPY requirements.txt /app/requirements.txt
RUN pip install -r requirements.txt

# Copy the Stanza resources
COPY stanza_resources /root/stanza_resources

# Copy the static folder (including geni.jpg)
COPY static /app/static

# Copy the remaining code
COPY . /app

# Configure the start command.
# The exec (JSON array) form does not perform variable substitution, so
# "${PORT}" would reach gunicorn literally. Running through `sh -c` lets the
# shell expand it; the port falls back to 8000 when PORT is not set, and
# `exec` makes gunicorn replace the shell so it receives signals directly.
CMD ["sh", "-c", "exec gunicorn --bind 0.0.0.0:${PORT:-8000} --timeout 1200 app:app"]
README.md CHANGED
@@ -1,11 +1,174 @@
 
 
 
 
1
  ---
2
- title: Genipapo Parser
3
- emoji: 😻
4
- colorFrom: pink
5
- colorTo: gray
6
- sdk: docker
7
- pinned: false
8
- short_description: IA multigenre dependency parser for Brazilian Portuguese
 
 
 
 
 
 
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Genipapo Web
2
+
3
+ **Genipapo Web** is a lightweight web-based interface for the **Genipapo Parser**, enabling users to validate and process `.conllu` files directly in their browser. This repository simplifies the deployment of the Genipapo Parser's web version using Docker.
4
+
5
  ---
6
+
7
+ ## Purpose
8
+
9
+ This project provides an accessible interface for the **Genipapo Parser**, allowing users to:
10
+
11
+ - **Validate and Parse** `.conllu` files directly in their web browser.
12
+ - **Easily Deploy** the parser via Docker, without requiring a complex local setup.
13
+ - **Build a local API version** of the parser, allowing faster local requests.
14
+
15
+ For details on the **Genipapo Parser** itself, visit the main repository:
16
+
17
+ [Genipapo Parser GitHub Repository](https://github.com/bryankhelven/genipapo)
18
+
19
  ---
20
 
21
+ ## Features
22
+
23
+ 1. **Web-Based Interface**:
24
+ - Upload `.conllu` files for validation and parsing.
25
+ - Download parsed files with updated dependency relations.
26
+ - View warnings and errors for `.conllu` file validation.
27
+
28
+ 2. **Dockerized Deployment**:
29
+ - Simplified setup with a single Docker command.
30
+ - No local installation of dependencies required.
31
+
32
+ 3. **Reference to Genipapo Parser**:
33
+ - Built on the Genipapo Parser, a multigenre dependency parser for Brazilian Portuguese.
34
+
35
+ ---
36
+
37
+ ## Prerequisites
38
+
39
+ - **Docker**: Ensure Docker is installed on your system. [Download Docker](https://www.docker.com/products/docker-desktop)
40
+ - **Python 3.7+** (only needed to prepare resources before building the Docker image; the download script also requires the `requests` package: `pip install requests`)
41
+
42
+ ---
43
+
44
+ ## Installation and Setup
45
+
46
+ ### 1. Clone the Repository
47
+
48
+ ```bash
49
+ git clone https://github.com/bryankhelven/genipapo_web.git
50
+ cd genipapo_web
51
+ ```
52
+
53
+ ### 2. Download Resources
54
+
55
+ Run the following script to download the necessary resources and models:
56
+
57
+ ```bash
58
+ python download_resources.py
59
+ ```
60
+
61
+ This will place the resources and model files in their respective folders:
62
+ - `stanza_resources/`
63
+ - `models/`
64
+
65
+ ### 3. Build the Docker Image
66
+
67
+ Build the Docker image using the following command:
68
+
69
+ ```bash
70
+ docker build -t genipapo-web .
71
+ ```
72
+
73
+ ### 4. Run the Docker Container
74
+
75
+ Run the container and expose the application on port `8000`:
76
+
77
+ ```bash
78
+ docker run -it -p 8000:8000 genipapo-web
79
+ ```
80
+
81
+ ### 5. Access the Application
82
+
83
+ Open your browser and navigate to:
84
+
85
+ ```text
86
+ http://localhost:8000/
87
+ ```
88
+
89
+ ---
90
+
91
+ ## API Usage
92
+
93
+ ### Endpoints
94
+
95
+ - **POST /api/process** - Process a `.conllu` file.
96
+ - **POST /api/process/json** - Process raw `.conllu` content in JSON format.
97
+
98
+ ### 1. Process a File
99
+
100
+ Use the `/api/process` endpoint to upload a `.conllu` file.
101
+
102
+ #### Parameters:
103
+
104
+ - **response_format** (optional): Set to `json` to return processed content as JSON. Defaults to `file`.
105
+
106
+ #### Example: Returning a File
107
+
108
+ When `response_format` is set to `file`, the processed content is returned as a downloadable `.conllu` file.
109
+
110
+ ```bash
111
+ curl -X POST -H "Content-Type: multipart/form-data" \
112
+ -F "file=@example.conllu" \
113
+ "http://localhost:8000/api/process?response_format=file" \
114
+ --output processed_example.conllu
115
+ ```
116
+
117
+ #### Example: Returning JSON
118
+
119
+ When `response_format` is set to `json`, the processed content is returned in JSON format.
120
+
121
+ ```bash
122
+ curl -X POST -H "Content-Type: multipart/form-data" \
123
+ -F "file=@example.conllu" \
124
+ "http://localhost:8000/api/process?response_format=json"
125
+ ```
126
+
127
+ Example JSON Response:
128
+
129
+ ```json
130
+ {
131
+ "status": "success",
132
+ "warnings": [],
133
+ "processed_content": "# sent_id = FOLHA_DOC000123_SENT016\n# text = O Capit\u00e3o Am\u00e9rica tamb\u00e9m bajulou o tucano.\n1\tO\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t2\tdet\t_\t_\n2\tCapit\u00e3o\tCapit\u00e3o\tPROPN\t_\t_\t5\tnsubj\t_\t_\n3\tAm\u00e9rica\tAm\u00e9rica\tPROPN\t_\t_\t2\tflat:name\t_\t_\n4\ttamb\u00e9m\ttamb\u00e9m\tADV\t_\t_\t5\tadvmod\t_\t_\n5\tbajulou\tbajular\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t0\troot\t_\t_\n6\to\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t7\tdet\t_\t_\n7\ttucano\ttucano\tNOUN\t_\tGender=Masc|Number=Sing\t5\tobj\t_\tSpaceAfter=No\n8\t.\t.\tPUNCT\t_\t_\t5\tpunct\t_\tSpaceAfter=No\n"
134
+ }
135
+ ```
136
+
137
+ ### 2. Process Raw Content
138
+
139
+ Use the `/api/process/json` endpoint to send raw CoNLL-U content as JSON.
140
+
141
+ #### Example:
142
+
143
+ ```bash
144
+ curl -X POST -H "Content-Type: application/json" \
145
+ -d '{"content": "# sent_id = FOLHA_DOC000123_SENT016\n# text = O Capit\u00e3o Am\u00e9rica tamb\u00e9m bajulou o tucano.\n1\tO\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t_\t_\t_\t_\n2\tCapit\u00e3o\tCapit\u00e3o\tPROPN\t_\t_\t_\t_\t_\t_\n3\tAm\u00e9rica\tAm\u00e9rica\tPROPN\t_\t_\t_\t_\t_\t_\n4\ttamb\u00e9m\ttamb\u00e9m\tADV\t_\t_\t_\t_\t_\t_\n5\tbajulou\tbajular\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t_\t_\t_\t_\n6\to\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t_\t_\t_\t_\n7\ttucano\ttucano\tNOUN\t_\tGender=Masc|Number=Sing\t_\t_\t_\tSpaceAfter=No\n8\t.\t.\tPUNCT\t_\t_\t_\t_\t_\tSpaceAfter=No"}' \
146
+ "http://localhost:8000/api/process/json"
147
+ ```
148
+
149
+ Example JSON Response:
150
+
151
+ ```json
152
+ {
153
+ "status": "success",
154
+ "warnings": [],
155
+ "processed_content": "# sent_id = FOLHA_DOC000123_SENT016\n# text = O Capit\u00e3o Am\u00e9rica tamb\u00e9m bajulou o tucano.\n1\tO\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t2\tdet\t_\t_\n2\tCapit\u00e3o\tCapit\u00e3o\tPROPN\t_\t_\t5\tnsubj\t_\t_\n3\tAm\u00e9rica\tAm\u00e9rica\tPROPN\t_\t_\t2\tflat:name\t_\t_\n4\ttamb\u00e9m\ttamb\u00e9m\tADV\t_\t_\t5\tadvmod\t_\t_\n5\tbajulou\tbajular\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t0\troot\t_\t_\n6\to\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t7\tdet\t_\t_\n7\ttucano\ttucano\tNOUN\t_\tGender=Masc|Number=Sing\t5\tobj\t_\tSpaceAfter=No\n8\t.\t.\tPUNCT\t_\t_\t5\tpunct\t_\tSpaceAfter=No\n"
156
+ }
157
+ ```
158
+
159
+ ---
160
+
161
+ ## Acknowledgments
162
+
163
+ - This work was carried out at the [Center for Artificial Intelligence of the University of São Paulo (C4AI)](http://c4ai.inova.usp.br/), supported by the São Paulo Research Foundation (FAPESP grant #2019/07665-4) and the IBM Corporation.
164
+ - The project was supported by the Ministry of Science, Technology and Innovation, with resources of Law N. 8.248, of October 23, 1991, within the scope of PPI-SOFTEX, coordinated by Softex and published as Residence in TIC 13, DOU 01245.010222/2022-44.
165
+ - **Genipapo** was developed using the [Stanza library](https://stanfordnlp.github.io/stanza/), courtesy of the Stanford NLP Group.
166
+
167
+ ---
168
+
169
+ ## Contact
170
+
171
+ For inquiries, suggestions, or bug reports, reach out to:
172
+
173
+ - **Email**: [bryankhelven@ieee.org](mailto:bryankhelven@ieee.org)
174
+ - **Main Parser Repository**: [Genipapo Parser](https://github.com/bryankhelven/genipapo)
app.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib.parse
2
+ from flask import Flask, request, send_file, render_template, make_response, jsonify
3
+ import stanza
4
+ from stanza.utils.conll import CoNLL
5
+ from conllu import parse_incr
6
+ import os
7
+ import tempfile
8
+ from io import StringIO
9
+
10
app = Flask(__name__)

# Absolute directory containing this file. Paths below are anchored to it so
# the app behaves the same regardless of the process's working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Ensure the templates folder is correctly configured
app.template_folder = os.path.join(BASE_DIR, 'templates')

# Define the model directory and path (anchored like the templates folder,
# instead of depending on the current working directory)
model_dir = os.path.join(BASE_DIR, 'models')
model_path = os.path.join(model_dir, 'genipapo.pt')

# Initialize the Stanza pipeline once at import time so every request reuses it
nlp = stanza.Pipeline(
    lang='pt',
    processors='depparse',
    depparse_pretagged=True,
    depparse_model_path=model_path,
    tokenize_pretokenized=True,
    use_gpu=False,
    download_method=None
)
29
+
30
def validate_conllu_file(content):
    """
    Validate the text of a .conllu file.

    Checks performed:
      1. Each token line has exactly 10 tab-separated columns.
      2. Each word token (integer ID) has a UPOS tag from the universal set.
    Forms or lemmas that are "_" issue warnings but do not prevent processing.

    Args:
        content: Full text of the .conllu file.

    Returns:
        A tuple (is_valid, errors, warnings). `errors` is an empty list when
        `is_valid` is True. Any exception raised while parsing is reported as
        a single "Parsing error: ..." entry and makes the content invalid.
    """
    errors = []
    warnings = []
    valid_pos_tags = {
        # Common universal POS tags
        "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN",
        "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"
    }

    # Split the raw content WITHOUT stripping it first: stripping removes
    # leading blank lines and would shift every reported line number relative
    # to the file the user uploaded.
    lines = content.split('\n')
    line_iter = iter(enumerate(lines, start=1))

    try:
        for sentence in parse_incr(StringIO(content)):
            for token in sentence:
                # Advance to the raw line corresponding to the current token,
                # skipping empty lines and comments.
                while True:
                    try:
                        line_num, line = next(line_iter)
                    except StopIteration:
                        raise Exception("Unexpected end of content while parsing tokens.")
                    line = line.strip()
                    if line and not line.startswith('#'):
                        break  # Found the token line

                columns = line.split('\t')
                if len(columns) != 10:
                    errors.append(f"Line {line_num} of the conllu file: Incorrect number of columns ({len(columns)} found, 10 required).")
                    continue  # Skip further checks for this token

                # Only tokens with an integer ID are checked further
                if isinstance(token['id'], int):
                    form = token.get('form', '').strip()
                    lemma = token.get('lemma', '').strip()
                    upos = token.get('upos', '').strip().upper()

                    if upos == '_':
                        errors.append(f"Line {line_num}: Missing POS tag (UPOS).")
                    elif upos not in valid_pos_tags:
                        errors.append(f"Error on line {line_num} of the conllu file: Invalid POS tag '{upos}'.")

                    if form == "_":
                        warnings.append(f"Warning on line {line_num} of the conllu file: Form is empty")
                    if lemma == "_":
                        warnings.append(f"Warning on line {line_num} of the conllu file: Lemma is empty")
    except Exception as e:
        errors.append(f"Parsing error: {str(e)}")
        return False, errors, warnings

    if errors:
        return False, errors, warnings
    return True, [], warnings
92
+
93
+ # Main route for file upload
94
@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """
    Serve the upload page (GET) or validate and parse an uploaded file (POST).

    POST expects a multipart field named "file" whose filename ends with
    ".conllu". On success the parsed file is returned as an attachment, with
    any validation warnings URL-encoded in the "X-Warnings" response header.

    Returns:
        400 for a missing/unnamed/wrong-type/non-UTF-8 file or failed
        validation, 500 if processing raises, otherwise the parsed file.
    """
    # Local import keeps this block self-contained (stdlib only).
    import html

    if request.method != 'POST':
        # Render the HTML template
        return render_template('upload_conllu.html')

    # Check if the file is present in the request
    if 'file' not in request.files:
        return 'No file found in the request.', 400
    file = request.files['file']
    if file.filename == '':
        return 'No file selected.', 400
    if not file.filename.endswith('.conllu'):
        return 'Invalid file type. Only .conllu files are allowed.', 400

    # Read the file content; a non-UTF-8 upload is a client error, not a crash
    try:
        content = file.read().decode('utf-8')
    except UnicodeDecodeError:
        return 'Invalid file encoding. The .conllu file must be UTF-8 encoded.', 400

    # Validate the .conllu file
    is_valid, errors, warnings = validate_conllu_file(content)
    if not is_valid:
        # Error messages embed text taken from the uploaded file (e.g. an
        # invalid POS tag), and the response is served as HTML, so each
        # message is escaped before being joined with <br>.
        error_message = "Validation failed:<br>" + "<br>".join(html.escape(err) for err in errors)
        return error_message, 400, {'Content-Type': 'text/html'}

    if warnings:
        # Warnings are logged here and also sent to the client in a header below
        print("Warnings:<br>" + "<br>".join(warnings))

    # Save the valid content and process it
    input_temp_path = save_temp_file(content)
    try:
        output_file_path = process_file(input_temp_path, file.filename)

        # Wrap in a response object so headers can be added
        response = make_response(send_file(output_file_path, as_attachment=True))

        if warnings:
            # URL-encode the joined warnings so they are safe inside a header
            response.headers['X-Warnings'] = urllib.parse.quote('\n'.join(warnings))
        return response
    except Exception as e:
        # Handle unexpected errors in processing (escaped: served as HTML)
        return f"Error during processing: {html.escape(str(e))}", 500
    finally:
        # The temporary input copy is no longer needed once processing is done
        try:
            os.remove(input_temp_path)
        except OSError:
            pass
147
+
148
def save_temp_file(content):
    """
    Write `content` to a new temporary ".conllu" file (UTF-8 text) and return
    the file's path. The file is not deleted automatically.
    """
    handle = tempfile.NamedTemporaryFile(
        delete=False, suffix='.conllu', mode='w', encoding='utf-8'
    )
    try:
        handle.write(content)
    finally:
        handle.close()
    return handle.name
155
+
156
def process_file(input_file_path, original_filename):
    """
    Process the .conllu file using the Stanza pipeline.

    Args:
        input_file_path: Path of the CoNLL-U file to parse.
        original_filename: Client-supplied name of the upload; only its final
            path component is used, to derive "<name>_parsed.conllu".

    Returns:
        Path of the written output file. It is placed in a freshly created
        temporary directory, so concurrent requests never overwrite each other.
    """
    doc = CoNLL.conll2doc(input_file=input_file_path)
    parsed_doc = nlp(doc)

    # Copy the predicted head/deprel onto the original document's words
    for orig_sentence, parsed_sentence in zip(doc.sentences, parsed_doc.sentences):
        for orig_word, parsed_word in zip(orig_sentence.words, parsed_sentence.words):
            orig_word.head = parsed_word.head
            orig_word.deprel = parsed_word.deprel

    # The filename comes from the client: keep only the last path component
    # (treating both "/" and "\" as separators) so a name such as
    # "../../x.conllu" cannot make the output land outside the temp directory.
    safe_name = os.path.basename(original_filename.replace('\\', '/'))
    base_name = os.path.splitext(safe_name)[0] or 'output'
    output_filename = base_name + '_parsed.conllu'

    # A unique directory per call keeps the user-visible file name while
    # preventing collisions between simultaneous uploads of the same name.
    output_dir = tempfile.mkdtemp(prefix='genipapo_')
    output_file_path = os.path.join(output_dir, output_filename)

    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write("{:C}".format(doc))
        f.write('\n\n')

    return output_file_path
177
+
178
@app.route('/api/process', methods=['POST'])
def process_api():
    """
    API endpoint: parse an uploaded .conllu file.

    Expects a multipart field named "file". The optional query parameter
    `response_format` selects the reply: "json" returns the processed content
    in a JSON body; any other value (default "file") returns it as an
    attachment named "processed.conllu", with warnings URL-encoded in the
    "X-Warnings" header.

    Returns:
        400 with a JSON error for request or validation problems, 500 with a
        JSON error if processing raises, otherwise the processed result.
    """
    response_format = request.args.get('response_format', 'file')

    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    if not file.filename.endswith('.conllu'):
        return jsonify({'error': 'Invalid file type. Only .conllu files are allowed.'}), 400

    # A non-UTF-8 upload is a client error; report it as JSON instead of
    # letting the decode failure surface as an unhandled 500.
    try:
        content = file.read().decode('utf-8')
    except UnicodeDecodeError:
        return jsonify({'error': 'Invalid file encoding. The .conllu file must be UTF-8 encoded.'}), 400

    is_valid, errors, warnings = validate_conllu_file(content)
    if not is_valid:
        return jsonify({'status': 'error', 'errors': errors, 'warnings': warnings}), 400

    # Save the valid file and process it
    input_temp_path = save_temp_file(content)

    try:
        output_file_path = process_file(input_temp_path, file.filename)

        if response_format == 'json':
            # Read the processed content from the file
            with open(output_file_path, 'r', encoding='utf-8') as processed_file:
                output_content = processed_file.read()

            # The content is now in memory; the output file is not needed
            try:
                os.remove(output_file_path)
            except OSError:
                pass

            return jsonify({
                'status': 'success',
                'warnings': warnings,
                'processed_content': output_content
            }), 200

        # Return the processed file directly
        response = send_file(output_file_path, as_attachment=True, download_name='processed.conllu')
        if warnings:
            warnings_str = '\n'.join(warnings)
            response.headers['X-Warnings'] = urllib.parse.quote(warnings_str)
        return response
    except Exception as e:
        return jsonify({'status': 'error', 'message': str(e)}), 500
    finally:
        # The temporary input copy is no longer needed once processing is done
        try:
            os.remove(input_temp_path)
        except OSError:
            pass
222
+
223
@app.route('/about')
def about():
    """Render the 'about.html' template."""
    about_page = render_template('about.html')
    return about_page
226
+
227
@app.route('/api/process/json', methods=['POST'])
def process_api_json():
    """
    API endpoint: parse raw CoNLL-U text sent as JSON.

    Expects a JSON object with a non-empty string field "content" holding the
    .conllu data.

    Returns:
        400 with a JSON error when the body is not a JSON object, "content" is
        missing/empty/not a string, or validation fails; 500 with a JSON error
        if processing raises; otherwise a JSON body with "status", "warnings"
        and "processed_content".
    """
    # Check if the request body is declared as JSON
    if not request.is_json:
        return jsonify({'error': 'Request body must be JSON'}), 400

    # silent=True returns None for malformed JSON instead of raising, so the
    # client always receives a JSON error body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a valid JSON object'}), 400

    content = data.get('content')
    # A non-string value (number, list, ...) would crash the validator
    if not content or not isinstance(content, str):
        return jsonify({'error': 'JSON must include a "content" field with .conllu data'}), 400

    # Validate the .conllu content
    is_valid, errors, warnings = validate_conllu_file(content)
    if not is_valid:
        return jsonify({'status': 'error', 'errors': errors, 'warnings': warnings}), 400

    input_temp_path = None
    output_file_path = None
    try:
        # Save the valid content to a temporary file
        input_temp_path = save_temp_file(content)
        output_file_path = process_file(input_temp_path, "input.conllu")

        # Read the processed content from the file
        with open(output_file_path, 'r', encoding='utf-8') as processed_file:
            output_content = processed_file.read()

        return jsonify({
            'status': 'success',
            'warnings': warnings,
            'processed_content': output_content
        }), 200
    except Exception as e:
        return jsonify({'status': 'error', 'message': str(e)}), 500
    finally:
        # Both temporary files are only needed while building the response
        for temp_path in (input_temp_path, output_file_path):
            if temp_path is not None:
                try:
                    os.remove(temp_path)
                except OSError:
                    pass
259
+
260
@app.route('/contact')
def contact():
    """Render the 'contact.html' template."""
    contact_page = render_template('contact.html')
    return contact_page
263
+
264
@app.route('/api_guide')
def api_guide():
    """Render the 'api_guide.html' template."""
    guide_page = render_template('api_guide.html')
    return guide_page
267
+
268
if __name__ == '__main__':
    # Run the Flask server directly. The port defaults to 8000 and can be
    # overridden through the PORT environment variable.
    app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 8000)))
download_resources.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import hashlib
4
+ import sys
5
+
6
def download_genipapo_model(model_checksum=None):
    """
    Download the Genipapo model to `models/genipapo.pt` (relative to the
    current working directory) unless a usable copy is already there.

    Args:
        model_checksum: Optional expected MD5 hex digest of the model file.
            When given, an existing file is verified against it and
            re-downloaded on mismatch. When None (the default), an existing
            file is kept as-is because there is nothing to verify it against.

    Exits the process with status 1 if the server does not answer HTTP 200.
    """
    # Direct download URL from GitHub Releases
    model_url = 'https://github.com/bryankhelven/genipapo/releases/download/Publishing/genipapo.pt'
    model_dir = os.path.join('models')
    model_path = os.path.join(model_dir, 'genipapo.pt')

    os.makedirs(model_dir, exist_ok=True)

    if os.path.exists(model_path):
        if model_checksum is None:
            print("Genipapo model already exists. No checksum provided; skipping verification.")
            return

        print("Genipapo model already exists. Verifying checksum...")
        # Hash in chunks so the whole model is never held in memory at once
        digest = hashlib.md5()
        with open(model_path, 'rb') as f:
            for block in iter(lambda: f.read(1024 * 1024), b''):
                digest.update(block)
        if digest.hexdigest() == model_checksum.lower():
            print("Checksum verified. Model is ready to use.")
            return
        print("Checksum mismatch. Redownloading the model...")
        os.remove(model_path)

    print("Downloading Genipapo model...")
    response = requests.get(model_url, stream=True)
    if response.status_code != 200:
        print("Failed to download the model. Please check the URL.")
        sys.exit(1)

    # Write to a side file and rename it at the end, so an interrupted
    # download never leaves a truncated file at the final model path.
    partial_path = model_path + '.part'
    with open(partial_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
    os.replace(partial_path, model_path)

    print("Download completed. Model is ready to use.")
38
+
39
+
40
# Directory where the resources will be saved
RESOURCE_DIR = "stanza_resources"
LANGUAGE = "pt"

# Map each required component to its (corrected) download URL
REQUIRED_COMPONENTS = {
    "backward_charlm": "https://huggingface.co/stanfordnlp/stanza-pt/resolve/main/models/backward_charlm/oscar2023.pt",
    "forward_charlm": "https://huggingface.co/stanfordnlp/stanza-pt/resolve/main/models/forward_charlm/oscar2023.pt",
    "pretrain": "https://huggingface.co/stanfordnlp/stanza-pt/resolve/main/models/pretrain/conll17.pt",
}
50
+
51
# Stream a remote file to disk in chunks
def download_file(url, dest_path):
    """
    Fetch `url` with a streaming GET and write the body to `dest_path`.

    The HTTP status is checked with `raise_for_status()` before the
    destination file is opened, so an error response raises without creating
    the file.
    """
    with requests.get(url, stream=True) as reply:
        reply.raise_for_status()
        with open(dest_path, "wb") as sink:
            # filter(None, ...) drops empty chunks
            for piece in filter(None, reply.iter_content(chunk_size=8192)):
                sink.write(piece)
59
+
60
# Download the specific resources listed in REQUIRED_COMPONENTS
def download_specific_resources():
    """
    Populate RESOURCE_DIR with the Stanza resources index and the required
    language components.

    Downloads `resources.json` into RESOURCE_DIR, then stores each entry of
    REQUIRED_COMPONENTS as `<RESOURCE_DIR>/<LANGUAGE>/<component>/model.pt`.
    """
    def _make_if_missing(path):
        # Create the directory only when nothing exists at that path yet
        if not os.path.exists(path):
            os.makedirs(path)

    _make_if_missing(RESOURCE_DIR)

    # Download the `resources.json` index file
    index_url = "https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json"
    index_path = os.path.join(RESOURCE_DIR, "resources.json")
    print("Baixando resources.json...")
    download_file(index_url, index_path)

    # Base path for the language's resources
    language_root = os.path.join(RESOURCE_DIR, LANGUAGE)
    _make_if_missing(language_root)

    # Download the required components
    for component in REQUIRED_COMPONENTS:
        source_url = REQUIRED_COMPONENTS[component]
        target_dir = os.path.join(language_root, component)
        os.makedirs(target_dir, exist_ok=True)
        component_path = os.path.join(target_dir, "model.pt")
        print(f"Baixando {component}...")
        download_file(source_url, component_path)
        print(f"{component} baixado para {component_path}")

    print("Download concluído. Recursos disponíveis em:", RESOURCE_DIR)
86
+
87
+
88
+
89
if __name__ == '__main__':
    # Script entry point: fetch the Genipapo model first, then the Stanza resources.
    download_genipapo_model()
    download_specific_resources()
92
+
example.conllu ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # sent_id = dante_01_441020223408578560l
2
+ # text = #PETR4 - Análise #Ichimoku - pregão de sexta-feira, 28 de fevereiro. http://t.co/oAHK5pB3e0
3
+ 1 #PETR4 #PETR4 PROPN _ _ _ _ _ _
4
+ 2 - - PUNCT _ _ _ _ _ _
5
+ 3 Análise Análise PROPN _ _ _ _ _ _
6
+ 4 #Ichimoku #Ichimoku PROPN _ _ _ _ _ _
7
+ 5 - - PUNCT _ _ _ _ _ _
8
+ 6 pregão pregão NOUN _ Gender=Masc|Number=Sing _ _ _ _
9
+ 7 de de ADP _ _ _ _ _ _
10
+ 8 sexta-feira sexta-feira NOUN _ Gender=Fem|Number=Sing _ _ _ SpaceAfter=No
11
+ 9 , , PUNCT _ _ _ _ _ _
12
+ 10 28 28 NUM _ NumType=Card _ _ _ _
13
+ 11 de de ADP _ _ _ _ _ _
14
+ 12 fevereiro fevereiro NOUN _ Gender=Masc|Number=Sing _ _ _ SpaceAfter=No
15
+ 13 . . PUNCT _ _ _ _ _ _
16
+ 14 http://t.co/oAHK5pB3e0 http://t.co/oAHK5pB3e0 SYM _ _ _ _ _ SpaceAfter=No
17
+
18
+ # sent_id = FOLHA_DOC000123_SENT016
19
+ # text = O Capitão América também bajulou o tucano.
20
+ 1 O o DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art _ _ _ _
21
+ 2 Capitão Capitão PROPN _ _ _ _ _ _
22
+ 3 América América PROPN _ _ _ _ _ _
23
+ 4 também também ADV _ _ _ _ _ _
24
+ 5 bajulou bajular VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin _ _ _ _
25
+ 6 o o DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art _ _ _ _
26
+ 7 tucano tucano NOUN _ Gender=Masc|Number=Sing _ _ _ SpaceAfter=No
27
+ 8 . . PUNCT _ _ _ _ _ SpaceAfter=No
28
+
29
+ # sent_id = 119-20141209-TESEMSC_0-9
30
+ # text = Atualmente, a bentonita sódica é a argila comercial mais utilizada em fluidos de perfuração (Amorim, 2003).
31
+ 1 Atualmente atualmente ADV _ _ _ _ _ SpaceAfter=No
32
+ 2 , , PUNCT _ _ _ _ _ _
33
+ 3 a o DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art _ _ _ _
34
+ 4 bentonita bentonita NOUN _ Gender=Fem|Number=Sing _ _ _ _
35
+ 5 sódica sódico ADJ _ Gender=Fem|Number=Sing _ _ _ _
36
+ 6 é ser AUX _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin _ _ _ _
37
+ 7 a o DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art _ _ _ _
38
+ 8 argila argila NOUN _ Gender=Fem|Number=Sing _ _ _ _
39
+ 9 comercial comercial ADJ _ Gender=Fem|Number=Sing _ _ _ _
40
+ 10 mais mais ADV _ _ _ _ _ _
41
+ 11 utilizada utilizar VERB _ Gender=Fem|Number=Sing|VerbForm=Part _ _ _ _
42
+ 12 em em ADP _ _ _ _ _ _
43
+ 13 fluidos fluido NOUN _ Gender=Masc|Number=Plur _ _ _ _
44
+ 14 de de ADP _ _ _ _ _ _
45
+ 15 perfuração perfuração NOUN _ Gender=Fem|Number=Sing _ _ _ _
46
+ 16 ( ( PUNCT _ _ _ _ _ SpaceAfter=No
47
+ 17 Amorim Amorim PROPN _ Gender=Masc|Number=Sing _ _ _ SpaceAfter=No
48
+ 18 , , PUNCT _ _ _ _ _ _
49
+ 19 2003 2003 NUM _ NumType=Card _ _ _ SpaceAfter=No
50
+ 20 ) ) PUNCT _ _ _ _ _ SpaceAfter=No
51
+ 21 . . PUNCT _ _ _ _ _ _
52
+
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app.py passes `download_name` to send_file, which requires Flask 2.0+
Flask>=2.0
gunicorn>=20.0.4
conllu>=4.4.2
# Imported by app.py and run_parser.py (the Dockerfile also installs it)
stanza>=1.2
# Imported by download_resources.py
requests>=2.25
4
+
5
+ # Note:
6
+ # Ensure that `libffi` is installed on your system, as it is required for compatibility with PyTorch.
7
+ # Installation instructions by operating system:
8
+ #
9
+ # - **Ubuntu/Debian**: Run `sudo apt install libffi-dev`
10
+ # - **CentOS/RHEL**: Run `sudo yum install libffi-devel`
11
+ # - **macOS**: Install via Homebrew with `brew install libffi`
12
+ # - **Windows**: `libffi` is included in Windows Python distributions, so no extra installation is typically needed.
13
+ #
14
+ # On WSL (Windows Subsystem for Linux), use the same command as for Linux distributions.
15
+ # If you encounter issues, check if `libffi.so.6` exists. If missing, create a symbolic link to the installed version:
16
+ # `sudo ln -s /usr/lib/x86_64-linux-gnu/libffi.so.7 /usr/lib/x86_64-linux-gnu/libffi.so.6`
run_parser.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import stanza
2
+ from stanza.utils.conll import CoNLL
3
+ import sys
4
+ import os
5
+
6
def main(input_file):
    """
    Parse a CoNLL-U file with the Genipapo dependency model.

    Loads `models/genipapo.pt` (relative to the current working directory),
    runs the Stanza depparse processor on the pre-tokenized, pre-tagged input
    and writes the result to `output.conllu` in the current directory.

    Args:
        input_file: Path of the .conllu file to parse.

    Returns:
        None. If the model file is missing, a hint is printed and the
        function returns without parsing.
    """
    # Directory and model paths
    model_dir = os.path.join('models')
    model_path = os.path.join(model_dir, 'genipapo.pt')

    # Check if the model file exists
    if not os.path.exists(model_path):
        # The download script in this repository is download_resources.py
        print("Genipapo model not found. Please run 'download_resources.py' first to download the model.")
        return

    # Initialize the Stanza pipeline with the custom dependency parser model
    nlp = stanza.Pipeline(
        lang='pt',
        processors='depparse',
        depparse_pretagged=True,  # Assumes the input file has POS tags already
        depparse_model_path=model_path,
        tokenize_pretokenized=True,  # Assumes tokens are already split in .conllu format
        use_gpu=False,
        download_method=None
    )

    # Load the input CoNLL-U file and run the pipeline on it
    doc = CoNLL.conll2doc(input_file=input_file)
    parsed_doc = nlp(doc)

    # Update original document with parsed dependency information
    for orig_sentence, parsed_sentence in zip(doc.sentences, parsed_doc.sentences):
        for orig_word, parsed_word in zip(orig_sentence.words, parsed_sentence.words):
            orig_word.head = parsed_word.head
            orig_word.deprel = parsed_word.deprel

    # Save the updated document in CoNLL-U format
    output_file = 'output.conllu'
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("{:C}".format(doc))
        f.write('\n\n')

    print(f"Updated CONLLU file saved to '{output_file}'")
44
+
45
if __name__ == '__main__':
    # Command-line entry point: expects exactly one argument, the input file.
    # Usage errors exit with a non-zero status so shells and scripts can
    # detect the failure.
    if len(sys.argv) != 2:
        print("Usage: python run_parser.py path/to/your_file.conllu")
        sys.exit(1)

    input_file = sys.argv[1]
    if not os.path.exists(input_file):
        print(f"Input file {input_file} does not exist.")
        sys.exit(1)

    main(input_file)
startup.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ gunicorn --bind=0.0.0.0 --timeout 1200 app:app
static/geni.jpg ADDED

Git LFS Details

  • SHA256: bbaa899ed6998f49f7a05f0b34ae07d2fc7f863ce7016777b4c68a4c52d0838b
  • Pointer size: 132 Bytes
  • Size of remote file: 1.17 MB
templates/about.html ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>About - Genipapo Parser</title>
7
+ <style>
8
+ body {
9
+ font-family: Arial, sans-serif;
10
+ margin: 0;
11
+ padding: 0;
12
+ background: url('/static/geni.jpg') no-repeat center center fixed;
13
+ background-size: cover;
14
+ background-color: rgba(0, 0, 0, 0.65);
15
+ background-blend-mode: overlay;
16
+ line-height: 1.6;
17
+ padding: 20px;
18
+ }
19
+
20
+ .header {
21
+ width: 100%;
22
+ background: rgba(3, 112, 49, 0.8);
23
+ padding: 10px 0;
24
+ position: fixed;
25
+ top: 0;
26
+ left: 0;
27
+ z-index: 1000;
28
+ }
29
+
30
+ .header nav {
31
+ display: flex;
32
+ justify-content: center;
33
+ gap: 1em;
34
+ }
35
+
36
+ .header a {
37
+ text-decoration: none;
38
+ color: white;
39
+ padding: 10px 15px;
40
+ border-radius: 5px;
41
+ transition: background-color 0.3s;
42
+ font-size: 1em;
43
+ }
44
+
45
+ .header a:hover, .header a.active {
46
+ background-color: rgba(4, 63, 28, 0.8);
47
+ }
48
+
49
+ .content {
50
+ margin: 80px auto 0; /* Espaço abaixo do header */
51
+ max-width: 800px;
52
+ background-color: rgba(255, 255, 255, 0.8);
53
+ border-radius: 8px;
54
+ padding: 20px;
55
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
56
+ }
57
+
58
+ h1 {
59
+ text-align: center;
60
+ font-size: 2.5em;
61
+ margin-bottom: 20px;
62
+ }
63
+
64
+ p {
65
+ margin-bottom: 1em;
66
+ }
67
+
68
+ ul {
69
+ list-style: disc;
70
+ margin: 10px 0 20px 20px;
71
+ }
72
+
73
+ li {
74
+ margin-bottom: 10px;
75
+ }
76
+
77
+ a {
78
+ color: #2946c5;
79
+ text-decoration: underline;
80
+ transition: color 0.3s;
81
+ }
82
+
83
+ a:hover {
84
+ color: #5573f8;
85
+ }
86
+ </style>
87
+ </head>
88
+ <body>
89
+ <header class="header">
90
+ <nav>
91
+ <a href="/">Genipapo</a>
92
+ <a href="about" class="active">About</a>
93
+ <a href="api_guide">API Guide</a>
94
+ <a href="contact">Contact Us</a>
95
+ </nav>
96
+ </header>
97
+
98
+ <div class="content">
99
+ <h1>Genipapo Parser</h1>
100
+ <p>
101
+ Genipapo is a multigenre dependency parser specifically tailored for Brazilian Portuguese, developed in alignment with the Universal Dependencies (UD) framework. Trained using three distinct gold-standard corpora - including journalistic texts, academic papers in the oil and gas domain, and user-generated content from X posts (formerly Twitter) - Genipapo delivers robust syntactic analysis across diverse text genres. Achieving a Labelled Attachment Score (LAS) exceeding 94%, it outperforms or matches the performance of single-genre parsers, making it a versatile tool for use in Natural Language Processing applications.
102
+ </p>
103
+ <h2>Acknowledgments</h2>
104
+ <ul>
105
+ <li>
106
+ This work was carried out at the Center for Artificial Intelligence of the University of São Paulo (<a href="http://c4ai.inova.usp.br/">C4AI</a>), with support by the São Paulo Research Foundation (FAPESP grant #2019/07665-4) and by the IBM Corporation. The project was also supported by the Ministry of Science, Technology and Innovation, with resources of Law N. 8.248, of October 23, 1991, within the scope of PPI-SOFTEX, coordinated by Softex and published as Residence in TIC 13, DOU 01245.010222/2022-44.
107
+ </li>
108
+ <li>
109
+ Genipapo was developed using the <a href="https://stanfordnlp.github.io/stanza/">Stanza</a> library. We thank the Stanford NLP Group for providing this tool for the NLP community.
110
+ </li>
111
+ </ul>
112
+ <h2>How to cite</h2>
113
+ <p>
114
+ Di Felippo, A.; Roman, N.T.; Barbosa, B.K.S.; Pardo, T.A.S. (2024). Genipapo - a Multigenre Dependency Parsing for Brazilian Portuguese. In the Proceedings of the 15th Symposium in Information and Human Language Technology (STIL). November, 17-21. Belém-PA, Brazil. p. 257-266. DOI: <a href="https://doi.org/10.5753/stil.2024.245415">https://doi.org/10.5753/stil.2024.245415</a>
115
+ </p>
116
+ </div>
117
+ </body>
118
+ </html>
templates/api_guide.html ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>API Guide - Genipapo Parser</title>
7
+ <style>
8
+ body {
9
+ font-family: Arial, sans-serif;
10
+ margin: 0;
11
+ padding: 0;
12
+ background: url('/static/geni.jpg') no-repeat center center fixed;
13
+ background-size: cover;
14
+ background-color: rgba(0, 0, 0, 0.65);
15
+ background-blend-mode: overlay;
16
+ line-height: 1.6;
17
+ padding: 20px;
18
+ }
19
+
20
+ .header {
21
+ width: 100%;
22
+ background: rgba(3, 112, 49, 0.8);
23
+ padding: 10px 0;
24
+ position: fixed;
25
+ top: 0;
26
+ left: 0;
27
+ z-index: 1000;
28
+ }
29
+
30
+ .header nav {
31
+ display: flex;
32
+ justify-content: center;
33
+ gap: 1em;
34
+ }
35
+
36
+ .header a {
37
+ text-decoration: none;
38
+ color: white;
39
+ padding: 10px 15px;
40
+ border-radius: 5px;
41
+ transition: background-color 0.3s;
42
+ font-size: 1em;
43
+ }
44
+
45
+ .header a:hover, .header a.active {
46
+ background-color: rgba(4, 63, 28, 0.8);
47
+ }
48
+
49
+ .content {
50
+ margin: 80px auto 0;
51
+ max-width: 800px;
52
+ background-color: rgba(255, 255, 255, 0.8);
53
+ border-radius: 8px;
54
+ padding: 20px;
55
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
56
+ }
57
+
58
+ h1 {
59
+ text-align: center;
60
+ font-size: 2.5em;
61
+ margin-bottom: 20px;
62
+ }
63
+
64
+ p, pre {
65
+ margin-bottom: 1em;
66
+ }
67
+
68
+ pre {
69
+ white-space: pre-wrap;
70
+ word-wrap: break-word;
71
+ overflow-wrap: break-word;
72
+ padding: 10px;
73
+ background: #f9f9f9;
74
+ border: 1px solid #ccc;
75
+ border-radius: 4px;
76
+ overflow-x: auto; /* Horizontal scrolling only if necessary */
77
+ }
78
+
79
+ ul {
80
+ list-style: disc;
81
+ margin: 10px 0 20px 20px;
82
+ }
83
+
84
+ li {
85
+ margin-bottom: 10px;
86
+ }
87
+
88
+ a {
89
+ color: #2946c5;
90
+ text-decoration: underline;
91
+ transition: color 0.3s;
92
+ }
93
+
94
+ a:hover {
95
+ color: #5573f8;
96
+ }
97
+
98
+ /* Inline code */
99
+ code {
100
+ background: #f4f4f4;
101
+ border: 1px solid #ddd;
102
+ padding: 2px 5px;
103
+ border-radius: 3px;
104
+ font-family: monospace;
105
+ white-space: normal; /* Inline allows breaking into lines */
106
+ word-wrap: break-word;
107
+ overflow-wrap: break-word;
108
+ display: inline; /* Ensures inline behavior */
109
+ }
110
+
111
+ /* Block code (inside <pre>) */
112
+ pre code {
113
+ background: #f9f9f9;
114
+ border: 1px solid #ccc;
115
+ padding: 10px;
116
+ border-radius: 4px;
117
+ font-family: monospace;
118
+ white-space: pre-wrap; /* Preserves formatting while allowing line breaks */
119
+ word-wrap: break-word;
120
+ overflow-wrap: break-word;
121
+ display: block; /* Ensures block behavior */
122
+ overflow-x: auto; /* Horizontal scrolling only if necessary */
123
+ }
124
+
125
+ </style>
126
+ </head>
127
+ <body>
128
+ <header class="header">
129
+ <nav>
130
+ <a href="/">Genipapo</a>
131
+ <a href="about">About</a>
132
+ <a href="api_guide" class="active">API Guide</a>
133
+ <a href="contact">Contact Us</a>
134
+ </nav>
135
+ </header>
136
+
137
+ <div class="content">
138
+ <h1>Genipapo API Guide</h1>
139
+ <p>
140
+ This guide provides instructions on how to use the Genipapo Parser API for processing
141
+ Brazilian Portuguese text in CoNLL-U format.
142
+ </p>
143
+ <p>
144
+ All the examples provided in this guide were extracted from the <strong>Porttinari Base</strong> corpus,
145
+ part of the <a href="https://sites.google.com/icmc.usp.br/poetisa/porttinari-2-0" target="_blank">Poetisa project</a>.
146
+ </p>
147
+
148
+ <h2>Endpoints</h2>
149
+ <ul>
150
+ <li><strong>POST /api/process</strong> - Process a <code>.conllu</code> file.</li>
151
+ <li><strong>POST /api/process/json</strong> - Process raw <code>.conllu</code> content in JSON format.</li>
152
+ </ul>
153
+
154
+ <h2>1. Process a File</h2>
155
+ <p>
156
+ Use the <code>/api/process</code> endpoint to upload a <code>.conllu</code> file. The endpoint accepts the following parameter:
157
+ </p>
158
+ <ul>
159
+ <li><strong>response_format</strong> (optional): Set to <code>json</code> to return processed content as JSON. Defaults to <code>file</code>.</li>
160
+ </ul>
161
+
162
+ <h3>1.1 Example: Returning a File</h3>
163
+ <p>
164
+ When <code>response_format</code> is set to <code>file</code>, the processed content is returned as a downloadable
165
+ <code>.conllu</code> file. Specify the output filename using <code>--output</code>.
166
+ </p>
167
+ <pre><code>curl -X POST -H "Content-Type: multipart/form-data" \
168
+ -F "file=@example.conllu" \
169
+ "https://genipapo-parser.azurewebsites.net/api/process?response_format=file" \
170
+ --output processed_example.conllu</code></pre>
171
+
172
+ <h3>1.2 Example: Returning JSON</h3>
173
+ <p>
174
+ When <code>response_format</code> is set to <code>json</code>, the processed content is returned in JSON format.
175
+ </p>
176
+ <pre><code>curl -X POST -H "Content-Type: multipart/form-data" \
177
+ -F "file=@example.conllu" \
178
+ "https://genipapo-parser.azurewebsites.net/api/process?response_format=json"</code></pre>
179
+
180
+ <h4>Example JSON Response:</h4>
181
+ <pre><code>{
182
+ "status": "success",
183
+ "warnings": [],
184
+ "processed_content": "# sent_id = FOLHA_DOC000123_SENT016\n# text = O Capitão América também bajulou o tucano.\n1\tO\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t2\tdet\t_\t_\n2\tCapitão\tCapitão\tPROPN\t_\t_\t5\tnsubj\t_\t_\n3\tAmérica\tAmérica\tPROPN\t_\t_\t2\tflat:name\t_\t_\n4\ttambém\ttambém\tADV\t_\t_\t5\tadvmod\t_\t_\n5\tbajulou\tbajular\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t0\troot\t_\t_\n6\to\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t7\tdet\t_\t_\n7\ttucano\ttucano\tNOUN\t_\tGender=Masc|Number=Sing\t5\tobj\t_\tSpaceAfter=No\n8\t.\t.\tPUNCT\t_\t_\t5\tpunct\t_\tSpaceAfter=No\n"
185
+ }</code></pre>
186
+
187
+ <h2>2. Process Raw Content</h2>
188
+ <p>
189
+ Use the <code>/api/process/json</code> endpoint to send raw CoNLL-U content as JSON. Include the content
190
+ in the <code>content</code> field of the JSON body.
191
+ </p>
192
+ <pre><code>curl -X POST -H "Content-Type: application/json" \
193
+ -d '{"content": "# sent_id = FOLHA_DOC000123_SENT016
194
+ # text = O Capitão América também bajulou o tucano.
195
+ 1\tO\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t_\t_\t_\t_
196
+ 2\tCapitão\tCapitão\tPROPN\t_\t_\t_\t_\t_\t_
197
+ 3\tAmérica\tAmérica\tPROPN\t_\t_\t_\t_\t_\t_
198
+ 4\ttambém\ttambém\tADV\t_\t_\t_\t_\t_\t_
199
+ 5\tbajulou\tbajular\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t_\t_\t_\t_
200
+ 6\to\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t_\t_\t_\t_
201
+ 7\ttucano\ttucano\tNOUN\t_\tGender=Masc|Number=Sing\t_\t_\t_\tSpaceAfter=No
202
+ 8\t.\t.\tPUNCT\t_\t_\t_\t_\t_\tSpaceAfter=No"}' \
203
+ "https://genipapo-parser.azurewebsites.net/api/process/json"</code></pre>
204
+
205
+ <h4>Example JSON Response:</h4>
206
+ <pre><code>{
207
+ "status": "success",
208
+ "warnings": [],
209
+ "processed_content": "# sent_id = FOLHA_DOC000123_SENT016\n# text = O Capitão América também bajulou o tucano.\n1\tO\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t2\tdet\t_\t_\n2\tCapitão\tCapitão\tPROPN\t_\t_\t5\tnsubj\t_\t_\n3\tAmérica\tAmérica\tPROPN\t_\t_\t2\tflat:name\t_\t_\n4\ttambém\ttambém\tADV\t_\t_\t5\tadvmod\t_\t_\n5\tbajulou\tbajular\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t0\troot\t_\t_\n6\to\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t7\tdet\t_\t_\n7\ttucano\ttucano\tNOUN\t_\tGender=Masc|Number=Sing\t5\tobj\t_\tSpaceAfter=No\n8\t.\t.\tPUNCT\t_\t_\t5\tpunct\t_\tSpaceAfter=No\n"
210
+ }</code></pre>
211
+
212
+ <h3>Example with Input and Output</h3>
213
+
214
+ <h4>Original Input</h4>
215
+ <pre><code># sent_id = FOLHA_DOC000123_SENT016
216
+ # text = O Capitão América também bajulou o tucano.
217
+ 1 O o DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art _ _ _ _
218
+ 2 Capitão Capitão PROPN _ _ _ _ _ _
219
+ 3 América América PROPN _ _ _ _ _ _
220
+ 4 também também ADV _ _ _ _ _ _
221
+ 5 bajulou bajular VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin _ _ _ _
222
+ 6 o o DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art _ _ _ _
223
+ 7 tucano tucano NOUN _ Gender=Masc|Number=Sing _ _ _ SpaceAfter=No
224
+ 8 . . PUNCT _ _ _ _ _ SpaceAfter=No</code></pre>
225
+
226
+ <h4>Processed Output</h4>
227
+ <pre><code># sent_id = FOLHA_DOC000123_SENT016
228
+ # text = O Capitão América também bajulou o tucano.
229
+ 1 O o DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 2 det _ _
230
+ 2 Capitão Capitão PROPN _ _ 5 nsubj _ _
231
+ 3 América América PROPN _ _ 2 flat:name _ _
232
+ 4 também também ADV _ _ 5 advmod _ _
233
+ 5 bajulou bajular VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 0 root _ _
234
+ 6 o o DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 7 det _ _
235
+ 7 tucano tucano NOUN _ Gender=Masc|Number=Sing 5 obj _ SpaceAfter=No
236
+ 8 . . PUNCT _ _ 5 punct _ SpaceAfter=No</code></pre>
237
+
238
+ <h2>Contact</h2>
239
+ <p>
240
+ For further assistance, please <a href="https://genipapo-parser.azurewebsites.net/contact">contact us</a>.
241
+ </p>
242
+ </div>
243
+
templates/contact.html ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Contact Us - Genipapo Parser</title>
7
+ <style>
8
+ body {
9
+ font-family: Arial, sans-serif;
10
+ margin: 0;
11
+ padding: 0;
12
+ background: url('/static/geni.jpg') no-repeat center center fixed;
13
+ background-size: cover;
14
+ background-color: rgba(0, 0, 0, 0.65);
15
+ background-blend-mode: overlay;
16
+ line-height: 1.6;
17
+ padding: 20px;
18
+ }
19
+
20
+ .header {
21
+ width: 100%;
22
+ background: rgba(3, 112, 49, 0.8);
23
+ padding: 10px 0;
24
+ position: fixed;
25
+ top: 0;
26
+ left: 0;
27
+ z-index: 1000;
28
+ }
29
+
30
+ .header nav {
31
+ display: flex;
32
+ justify-content: center;
33
+ gap: 1em;
34
+ }
35
+
36
+ .header a {
37
+ text-decoration: none;
38
+ color: white;
39
+ padding: 10px 15px;
40
+ border-radius: 5px;
41
+ transition: background-color 0.3s;
42
+ font-size: 1em;
43
+ }
44
+
45
+ .header a:hover, .header a.active {
46
+ background-color: rgba(4, 63, 28, 0.8);
47
+ }
48
+
49
+ .content {
50
+ margin: 80px auto 0; /* Espaço abaixo do header */
51
+ max-width: 600px;
52
+ background-color: rgba(255, 255, 255, 0.8);
53
+ border-radius: 8px;
54
+ padding: 20px;
55
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
56
+ }
57
+
58
+ h1 {
59
+ text-align: center;
60
+ font-size: 2.5em;
61
+ margin-bottom: 20px;
62
+ }
63
+
64
+ p {
65
+ margin-bottom: 1em;
66
+ }
67
+
68
+ a {
69
+ color: #2946c5;
70
+ text-decoration: underline;
71
+ transition: color 0.3s;
72
+ }
73
+
74
+ a:hover {
75
+ color: #5573f8;
76
+ }
77
+
78
+ .email, .github-link {
79
+ text-align: center;
80
+ font-size: 1.2em;
81
+ margin: 15px 0;
82
+ }
83
+ </style>
84
+ </head>
85
+ <body>
86
+ <header class="header">
87
+ <nav>
88
+ <a href="/">Genipapo</a>
89
+ <a href="about">About</a>
90
+ <a href="api_guide">API Guide</a>
91
+ <a href="contact" class="active">Contact Us</a>
92
+ </nav>
93
+ </header>
94
+
95
+ <div class="content">
96
+ <h1>Contact Us</h1>
97
+ <p class="email">
98
+ For inquiries, you can reach us at:
99
+ <a href="mailto:bryankhelven@ieee.org">bryankhelven@ieee.org</a>
100
+ </p>
101
+ <p class="github-link">
102
+ If you'd like to run the parser locally, visit our GitHub repository: <br>
103
+ <a href="https://github.com/bryankhelven/genipapo" target="_blank">Genipapo repository</a>
104
+ </p>
105
+ </div>
106
+ </body>
107
+ </html>
templates/upload_conllu.html ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <header class="header">
4
+ <nav>
5
+ <a href="/" class="active">Genipapo</a>
6
+ <a href="about">About</a>
7
+ <a href="api_guide">API Guide</a>
8
+ <a href="contact">Contact Us</a>
9
+ </nav>
10
+ </header>
11
+ <head>
12
+ <meta charset="UTF-8">
13
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
14
+ <title>Genipapo Parser</title>
15
+ <style>
16
+ body {
17
+ display: flex;
18
+ justify-content: center;
19
+ align-items: center;
20
+ height: 100vh;
21
+ font-family: Arial, sans-serif;
22
+ margin: 0;
23
+ background: url('/static/geni.jpg') no-repeat center center fixed;
24
+ background-size: cover;
25
+ background-color: rgba(0, 0, 0, 0.65); /* Optional overlay for translucency */
26
+ background-blend-mode: overlay;
27
+ }
28
+ #upload-container {
29
+ text-align: center;
30
+ background-color: rgba(255, 255, 255, 0.8); /* White background with 80% opacity */
31
+ padding: 2em;
32
+ box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.1);
33
+ border-radius: 8px;
34
+ max-width: 500px;
35
+ }
36
+ #button-container {
37
+ display: flex;
38
+ justify-content: center;
39
+ align-items: center;
40
+ gap: 1em; /* Space between buttons */
41
+ flex-wrap: wrap; /* Allow buttons to wrap if needed */
42
+ }
43
+ .custom-file-label {
44
+ display: inline-block;
45
+ background-color: #007bff;
46
+ color: white;
47
+ padding: 0.5em 1em;
48
+ border-radius: 4px;
49
+ cursor: pointer;
50
+ text-align: center;
51
+ max-width: 200px; /* Limit width for long file names */
52
+ white-space: nowrap;
53
+ overflow: hidden;
54
+ text-overflow: ellipsis; /* Add "..." for long text */
55
+ }
56
+ .custom-file-label:hover {
57
+ background-color: #0056b3;
58
+ }
59
+ #submitBtn {
60
+ padding: 0.5em 2em;
61
+ background-color: #28a745;
62
+ color: white;
63
+ border: none;
64
+ cursor: pointer;
65
+ border-radius: 4px;
66
+ }
67
+ #submitBtn:disabled {
68
+ background-color: #999;
69
+ }
70
+ #message {
71
+ margin-top: 1em;
72
+ }
73
+ h1 {
74
+ font-size: 2em;
75
+ margin-bottom: 1em;
76
+ }
77
+ p {
78
+ font-size: 1.2em;
79
+ margin-bottom: 1em;
80
+ }
81
+ .header {
82
+ width: 100%;
83
+ background: rgba(3, 112, 49, 0.8); /* Green header bar, 80% opacity */
84
+ padding: 10px 0;
85
+ position: fixed;
86
+ top: 0;
87
+ left: 0;
88
+ z-index: 1000;
89
+ }
90
+
91
+ .header nav {
92
+ display: flex;
93
+ justify-content: center; /* Centraliza os botões no meio */
94
+ gap: 1em; /* Espaço entre os botões */
95
+ }
96
+
97
+ .header a {
98
+ text-decoration: none;
99
+ color: white;
100
+ padding: 10px 15px;
101
+ border-radius: 5px;
102
+ transition: background-color 0.3s;
103
+ font-size: 1em; /* Tamanho consistente */
104
+ }
105
+
106
+ .header a:hover, .header a.active {
107
+ background-color: rgba(4, 63, 28, 0.8); /* Tom mais escuro no hover/seleção */
108
+ }
109
+ </style>
110
+ </head>
111
+ <body>
112
+ <div id="upload-container">
113
+ <h1>Upload a conllu File</h1>
114
+ <p>Please upload a conllu file that is properly structured and already has POS tags.</p>
115
+
116
+ <div id="button-container">
117
+ <!-- Custom file upload button -->
118
+ <label for="fileInput" class="custom-file-label">Choose File</label>
119
+ <input type="file" id="fileInput" accept=".conllu" hidden>
120
+
121
+ <!-- Submit button -->
122
+ <button id="submitBtn" onclick="uploadFile()">Upload and Process</button>
123
+ </div>
124
+
125
+ <div id="message"></div>
126
+ </div>
127
+
128
+ <script>
129
+ const fileInput = document.getElementById('fileInput');
130
+ const customFileLabel = document.querySelector('.custom-file-label');
131
+
132
+ // Update the custom button text when a file is selected
133
+ fileInput.addEventListener('change', function () {
134
+ if (fileInput.files.length > 0) {
135
+ customFileLabel.textContent = fileInput.files[0].name;
136
+ }
137
+ });
138
+ </script>
139
+ </body>
140
+
141
+
142
+
143
+ <script>
144
+ async function uploadFile() {
145
+ const fileInput = document.getElementById('fileInput');
146
+ const messageDiv = document.getElementById('message');
147
+ const submitBtn = document.getElementById('submitBtn');
148
+
149
+ if (!fileInput.files.length) {
150
+ messageDiv.textContent = 'Please select a .conllu file.';
151
+ return;
152
+ }
153
+
154
+ const file = fileInput.files[0];
155
+ if (!file.name.endsWith('.conllu')) {
156
+ messageDiv.textContent = 'Only .conllu files are allowed.';
157
+ return;
158
+ }
159
+
160
+ messageDiv.textContent = 'Checking file...';
161
+ submitBtn.disabled = true;
162
+
163
+ // Prepare the file for upload
164
+ const formData = new FormData();
165
+ formData.append('file', file);
166
+
167
+ try {
168
+ // Display progress message
169
+ messageDiv.textContent = 'File is being processed...';
170
+
171
+ const response = await fetch('/', {
172
+ method: 'POST',
173
+ body: formData
174
+ });
175
+
176
+ if (response.ok) {
177
+ // Get warnings from headers
178
+ const warningsHeader = response.headers.get('X-Warnings');
179
+ if (warningsHeader) {
180
+ // URL-decode the warnings string
181
+ const warningsDecoded = decodeURIComponent(warningsHeader);
182
+ // Split the warnings back into an array
183
+ const warningsArray = warningsDecoded.split('\n');
184
+ // Display warnings to the user
185
+ const warningsHtml = warningsArray.map(warning => `<div>${warning}</div>`).join('');
186
+ messageDiv.innerHTML = `<div>Warnings:</div>${warningsHtml}`;
187
+ } else {
188
+ messageDiv.textContent = 'File processed successfully.';
189
+ }
190
+
191
+ // Handle the file download
192
+ const blob = await response.blob();
193
+ const downloadUrl = URL.createObjectURL(blob);
194
+ const a = document.createElement('a');
195
+ a.href = downloadUrl;
196
+ a.download = `${file.name.replace('.conllu', '')}_parsed.conllu`;
197
+ document.body.appendChild(a);
198
+ a.click();
199
+ a.remove();
200
+ } else {
201
+ // Read the error message from the response body
202
+ const responseText = await response.text();
203
+ // Display the error message
204
+ messageDiv.innerHTML = `Error: ${responseText}`;
205
+ }
206
+ } catch (error) {
207
+ messageDiv.textContent = `Error: ${error.message}`;
208
+ } finally {
209
+ submitBtn.disabled = false;
210
+ }
211
+ }
212
+ </script>
213
+
214
+
215
+ </body>
216
+ </html>