Spaces:
Sleeping
Sleeping
Bryan Khelven
commited on
Commit
·
ffdedc7
1
Parent(s):
3e3bf83
Initial deploy
Browse files- .gitattributes +11 -33
- Dockerfile +24 -0
- README.md +171 -8
- app.py +270 -0
- download_resources.py +92 -0
- example.conllu +52 -0
- requirements.txt +16 -0
- run_parser.py +53 -0
- startup.txt +1 -0
- static/geni.jpg +3 -0
- templates/about.html +118 -0
- templates/api_guide.html +243 -0
- templates/contact.html +107 -0
- templates/upload_conllu.html +216 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,13 @@
|
|
| 1 |
-
|
| 2 |
-
*.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
# LFS-managed binary types
|
| 2 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.gif filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 7 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
|
| 12 |
+
# Treat everything else as text (auto-normalize line endings)
|
| 13 |
+
* text=auto
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dockerfile
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use a PyTorch base image with torch already installed
FROM pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime

# Install Stanza.
# The requirement is quoted: unquoted, the shell parses ">=1.2" as an output
# redirection, so pip would install an unconstrained "stanza" and a stray file
# named "=1.2" would be created in the image.
RUN pip install "stanza>=1.2"

# Define the working directory
WORKDIR /app

# Copy dependency files and install
COPY requirements.txt /app/requirements.txt
RUN pip install -r requirements.txt

# Copy the Stanza resources
COPY stanza_resources /root/stanza_resources

# Copy the static folder (including geni.jpg)
COPY static /app/static

# Copy the remaining code
COPY . /app

# Configure the start command.
# The exec (JSON-array) form does not perform shell variable expansion, so
# "${PORT}" would reach gunicorn literally. Run through `sh -c` so PORT is
# expanded, default it to 8000 when the platform does not set it, and `exec`
# so gunicorn replaces the shell and receives signals directly.
CMD ["sh", "-c", "exec gunicorn --bind 0.0.0.0:${PORT:-8000} --timeout 1200 app:app"]
|
README.md
CHANGED
|
@@ -1,11 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Genipapo Web
|
| 2 |
+
|
| 3 |
+
**Genipapo Web** is a lightweight web-based interface for the **Genipapo Parser**, enabling users to validate and process `.conllu` files directly in their browser. This repository simplifies the deployment of the Genipapo Parser's web version using Docker.
|
| 4 |
+
|
| 5 |
---
|
| 6 |
+
|
| 7 |
+
## Purpose
|
| 8 |
+
|
| 9 |
+
This project provides an accessible interface for the **Genipapo Parser**, allowing users to:
|
| 10 |
+
|
| 11 |
+
- **Validate and Parse** `.conllu` files directly in their web browser.
|
| 12 |
+
- **Easily Deploy** the parser via Docker, without requiring a complex local setup.
|
| 13 |
+
- **Build a local API version** of the parser, allowing faster local requests.
|
| 14 |
+
|
| 15 |
+
For details on the **Genipapo Parser** itself, visit the main repository:
|
| 16 |
+
|
| 17 |
+
[Genipapo Parser GitHub Repository](https://github.com/bryankhelven/genipapo)
|
| 18 |
+
|
| 19 |
---
|
| 20 |
|
| 21 |
+
## Features
|
| 22 |
+
|
| 23 |
+
1. **Web-Based Interface**:
|
| 24 |
+
- Upload `.conllu` files for validation and parsing.
|
| 25 |
+
- Download parsed files with updated dependency relations.
|
| 26 |
+
- View warnings and errors for `.conllu` file validation.
|
| 27 |
+
|
| 28 |
+
2. **Dockerized Deployment**:
|
| 29 |
+
- Simplified setup with a single Docker command.
|
| 30 |
+
- No local installation of dependencies required.
|
| 31 |
+
|
| 32 |
+
3. **Reference to Genipapo Parser**:
|
| 33 |
+
- Built on the Genipapo Parser, a multigenre dependency parser for Brazilian Portuguese.
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
## Prerequisites
|
| 38 |
+
|
| 39 |
+
- **Docker**: Ensure Docker is installed on your system. [Download Docker](https://www.docker.com/products/docker-desktop)
|
| 40 |
+
- **Python 3.7+** (only needed to prepare resources before building the Docker image)
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
## Installation and Setup
|
| 45 |
+
|
| 46 |
+
### 1. Clone the Repository
|
| 47 |
+
|
| 48 |
+
```bash
|
| 49 |
+
git clone https://github.com/bryankhelven/genipapo_web.git
|
| 50 |
+
cd genipapo_web
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
### 2. Download Resources
|
| 54 |
+
|
| 55 |
+
Run the following script to download the necessary resources and models:
|
| 56 |
+
|
| 57 |
+
```bash
|
| 58 |
+
python download_resources.py
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
This will place the resources and model files in their respective folders:
|
| 62 |
+
- `stanza_resources/`
|
| 63 |
+
- `models/`
|
| 64 |
+
|
| 65 |
+
### 3. Build the Docker Image
|
| 66 |
+
|
| 67 |
+
Build the Docker image using the following command:
|
| 68 |
+
|
| 69 |
+
```bash
|
| 70 |
+
docker build -t genipapo-web .
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
### 4. Run the Docker Container
|
| 74 |
+
|
| 75 |
+
Run the container and expose the application on port `8000`:
|
| 76 |
+
|
| 77 |
+
```bash
|
| 78 |
+
docker run -it -p 8000:8000 genipapo-web
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
### 5. Access the Application
|
| 82 |
+
|
| 83 |
+
Open your browser and navigate to:
|
| 84 |
+
|
| 85 |
+
```text
|
| 86 |
+
http://localhost:8000/
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
---
|
| 90 |
+
|
| 91 |
+
## API Usage
|
| 92 |
+
|
| 93 |
+
### Endpoints
|
| 94 |
+
|
| 95 |
+
- **POST /api/process** - Process a `.conllu` file.
|
| 96 |
+
- **POST /api/process/json** - Process raw `.conllu` content in JSON format.
|
| 97 |
+
|
| 98 |
+
### 1. Process a File
|
| 99 |
+
|
| 100 |
+
Use the `/api/process` endpoint to upload a `.conllu` file.
|
| 101 |
+
|
| 102 |
+
#### Parameters:
|
| 103 |
+
|
| 104 |
+
- **response_format** (optional): Set to `json` to return processed content as JSON. Defaults to `file`.
|
| 105 |
+
|
| 106 |
+
#### Example: Returning a File
|
| 107 |
+
|
| 108 |
+
When `response_format` is set to `file`, the processed content is returned as a downloadable `.conllu` file.
|
| 109 |
+
|
| 110 |
+
```bash
|
| 111 |
+
curl -X POST -H "Content-Type: multipart/form-data" \
|
| 112 |
+
-F "file=@example.conllu" \
|
| 113 |
+
"http://localhost:8000/api/process?response_format=file" \
|
| 114 |
+
--output processed_example.conllu
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
#### Example: Returning JSON
|
| 118 |
+
|
| 119 |
+
When `response_format` is set to `json`, the processed content is returned in JSON format.
|
| 120 |
+
|
| 121 |
+
```bash
|
| 122 |
+
curl -X POST -H "Content-Type: multipart/form-data" \
|
| 123 |
+
-F "file=@example.conllu" \
|
| 124 |
+
"http://localhost:8000/api/process?response_format=json"
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
Example JSON Response:
|
| 128 |
+
|
| 129 |
+
```json
|
| 130 |
+
{
|
| 131 |
+
"status": "success",
|
| 132 |
+
"warnings": [],
|
| 133 |
+
"processed_content": "# sent_id = FOLHA_DOC000123_SENT016\n# text = O Capit\u00e3o Am\u00e9rica tamb\u00e9m bajulou o tucano.\n1\tO\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t2\tdet\t_\t_\n2\tCapit\u00e3o\tCapit\u00e3o\tPROPN\t_\t_\t5\tnsubj\t_\t_\n3\tAm\u00e9rica\tAm\u00e9rica\tPROPN\t_\t_\t2\tflat:name\t_\t_\n4\ttamb\u00e9m\ttamb\u00e9m\tADV\t_\t_\t5\tadvmod\t_\t_\n5\tbajulou\tbajular\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t0\troot\t_\t_\n6\to\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t7\tdet\t_\t_\n7\ttucano\ttucano\tNOUN\t_\tGender=Masc|Number=Sing\t5\tobj\t_\tSpaceAfter=No\n8\t.\t.\tPUNCT\t_\t_\t5\tpunct\t_\tSpaceAfter=No\n"
|
| 134 |
+
}
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
### 2. Process Raw Content
|
| 138 |
+
|
| 139 |
+
Use the `/api/process/json` endpoint to send raw CoNLL-U content as JSON.
|
| 140 |
+
|
| 141 |
+
#### Example:
|
| 142 |
+
|
| 143 |
+
```bash
|
| 144 |
+
curl -X POST -H "Content-Type: application/json" \
|
| 145 |
+
-d '{"content": "# sent_id = FOLHA_DOC000123_SENT016\n# text = O Capit\u00e3o Am\u00e9rica tamb\u00e9m bajulou o tucano.\n1\tO\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t_\t_\t_\t_\n2\tCapit\u00e3o\tCapit\u00e3o\tPROPN\t_\t_\t_\t_\t_\t_\n3\tAm\u00e9rica\tAm\u00e9rica\tPROPN\t_\t_\t_\t_\t_\t_\n4\ttamb\u00e9m\ttamb\u00e9m\tADV\t_\t_\t_\t_\t_\t_\n5\tbajulou\tbajular\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t_\t_\t_\t_\n6\to\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t_\t_\t_\t_\n7\ttucano\ttucano\tNOUN\t_\tGender=Masc|Number=Sing\t_\t_\t_\tSpaceAfter=No\n8\t.\t.\tPUNCT\t_\t_\t_\t_\t_\tSpaceAfter=No"}' \
|
| 146 |
+
"http://localhost:8000/api/process/json"
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
Example JSON Response:
|
| 150 |
+
|
| 151 |
+
```json
|
| 152 |
+
{
|
| 153 |
+
"status": "success",
|
| 154 |
+
"warnings": [],
|
| 155 |
+
"processed_content": "# sent_id = FOLHA_DOC000123_SENT016\n# text = O Capit\u00e3o Am\u00e9rica tamb\u00e9m bajulou o tucano.\n1\tO\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t2\tdet\t_\t_\n2\tCapit\u00e3o\tCapit\u00e3o\tPROPN\t_\t_\t5\tnsubj\t_\t_\n3\tAm\u00e9rica\tAm\u00e9rica\tPROPN\t_\t_\t2\tflat:name\t_\t_\n4\ttamb\u00e9m\ttamb\u00e9m\tADV\t_\t_\t5\tadvmod\t_\t_\n5\tbajulou\tbajular\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t0\troot\t_\t_\n6\to\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t7\tdet\t_\t_\n7\ttucano\ttucano\tNOUN\t_\tGender=Masc|Number=Sing\t5\tobj\t_\tSpaceAfter=No\n8\t.\t.\tPUNCT\t_\t_\t5\tpunct\t_\tSpaceAfter=No\n"
|
| 156 |
+
}
|
| 157 |
+
```
|
| 158 |
+
|
| 159 |
+
---
|
| 160 |
+
|
| 161 |
+
## Acknowledgments
|
| 162 |
+
|
| 163 |
+
- This work was carried out at the [Center for Artificial Intelligence of the University of São Paulo (C4AI)](http://c4ai.inova.usp.br/), supported by the São Paulo Research Foundation (FAPESP grant #2019/07665-4) and the IBM Corporation.
|
| 164 |
+
- The project was supported by the Ministry of Science, Technology and Innovation, with resources of Law N. 8.248, of October 23, 1991, within the scope of PPI-SOFTEX, coordinated by Softex and published as Residence in TIC 13, DOU 01245.010222/2022-44.
|
| 165 |
+
- **Genipapo** was developed using the [Stanza library](https://stanfordnlp.github.io/stanza/), courtesy of the Stanford NLP Group.
|
| 166 |
+
|
| 167 |
+
---
|
| 168 |
+
|
| 169 |
+
## Contact
|
| 170 |
+
|
| 171 |
+
For inquiries, suggestions, or bug reports, reach out to:
|
| 172 |
+
|
| 173 |
+
- **Email**: [bryankhelven@ieee.org](mailto:bryankhelven@ieee.org)
|
| 174 |
+
- **Main Parser Repository**: [Genipapo Parser](https://github.com/bryankhelven/genipapo)
|
app.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import urllib.parse
|
| 2 |
+
from flask import Flask, request, send_file, render_template, make_response, jsonify
|
| 3 |
+
import stanza
|
| 4 |
+
from stanza.utils.conll import CoNLL
|
| 5 |
+
from conllu import parse_incr
|
| 6 |
+
import os
|
| 7 |
+
import tempfile
|
| 8 |
+
from io import StringIO
|
| 9 |
+
|
| 10 |
+
app = Flask(__name__)

# Point Flask at the "templates" directory that sits next to this file,
# independent of the process working directory.
_APP_DIR = os.path.dirname(os.path.abspath(__file__))
app.template_folder = os.path.join(_APP_DIR, 'templates')

# Location of the custom dependency-parser model (relative to the working
# directory the server is started from).
model_dir = os.path.join('models')
model_path = os.path.join(model_dir, 'genipapo.pt')

# Build the Stanza pipeline a single time at import so every request reuses it.
# Only the dependency parser runs; input is expected to be already tokenized
# and POS-tagged.
# NOTE(review): download_method=None presumably stops Stanza from fetching
# resources at start-up -- confirm against the installed Stanza version.
_PIPELINE_OPTIONS = dict(
    lang='pt',
    processors='depparse',
    depparse_pretagged=True,
    depparse_model_path=model_path,
    tokenize_pretokenized=True,
    use_gpu=False,
    download_method=None,
)
nlp = stanza.Pipeline(**_PIPELINE_OPTIONS)
|
| 29 |
+
|
| 30 |
+
def validate_conllu_file(content):
    """
    Check CoNLL-U text before it is handed to the parser.

    Errors (which make the content invalid):
      * a token line that does not have exactly 10 tab-separated columns;
      * a word token whose UPOS is "_" or not one of the universal POS tags.
    Warnings (reported, but the content stays valid):
      * a word token whose FORM or LEMMA is "_".

    Returns a tuple ``(is_valid, errors, warnings)``. If parsing itself raises,
    the exception text is appended to ``errors`` as "Parsing error: ..." and
    ``is_valid`` is False.
    """
    errors = []
    warnings = []
    valid_pos_tags = {
        # Common universal POS tags
        "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN",
        "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X",
    }

    # Lazily yield (1-based line number, stripped text) for every line that is
    # neither blank nor a comment. Tokens produced by the parser are paired
    # with these lines in order so messages can cite a line number.
    numbered_lines = enumerate(content.strip().split('\n'), start=1)
    token_lines = (
        (number, text.strip())
        for number, text in numbered_lines
        if text.strip() != '' and not text.strip().startswith('#')
    )

    try:
        for sentence in parse_incr(StringIO(content)):
            for token in sentence:
                located = next(token_lines, None)
                if located is None:
                    raise Exception("Unexpected end of content while parsing tokens.")
                line_num, line = located

                column_count = len(line.split('\t'))
                if column_count != 10:
                    errors.append(f"Line {line_num} of the conllu file: Incorrect number of columns ({column_count} found, 10 required).")
                    continue  # no further checks for a malformed line

                # Only integer ids are checked; other id shapes are skipped.
                if not isinstance(token['id'], int):
                    continue

                form = token.get('form', '').strip()
                lemma = token.get('lemma', '').strip()
                upos = token.get('upos', '').strip().upper()

                if upos == '_':
                    errors.append(f"Line {line_num}: Missing POS tag (UPOS).")
                elif upos not in valid_pos_tags:
                    errors.append(f"Error on line {line_num} of the conllu file: Invalid POS tag '{upos}'.")

                if form == "_":
                    warnings.append(f"Warning on line {line_num} of the conllu file: Form is empty")
                if lemma == "_":
                    warnings.append(f"Warning on line {line_num} of the conllu file: Lemma is empty")
    except Exception as e:
        errors.append(f"Parsing error: {str(e)}")
        return False, errors, warnings

    return not errors, errors, warnings
|
| 92 |
+
|
| 93 |
+
# Main route for file upload
|
| 94 |
+
@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """
    Serve the upload page (GET) or validate and parse an uploaded file (POST).

    POST expects a multipart field named ``file`` whose name ends in
    ``.conllu``. On success the parsed file is returned as an attachment and
    any validation warnings are URL-encoded into the ``X-Warnings`` header.
    Validation failures return 400 with an HTML message; processing failures
    return 500.
    """
    # Local import: only needed to escape text placed into HTML responses.
    import html

    if request.method != 'POST':
        # Render the HTML template
        return render_template('upload_conllu.html')

    # Check if the file is present in the request
    if 'file' not in request.files:
        return 'No file found in the request.', 400
    file = request.files['file']
    if file.filename == '':
        return 'No file selected.', 400
    if not file.filename.endswith('.conllu'):
        return 'Invalid file type. Only .conllu files are allowed.', 400

    # Read the file content; a non-UTF-8 upload is a client error, not a crash.
    try:
        content = file.read().decode('utf-8')
    except UnicodeDecodeError:
        return 'Invalid file encoding. The file must be UTF-8 encoded.', 400

    # Validate the .conllu file
    is_valid, errors, warnings = validate_conllu_file(content)
    if not is_valid:
        # The messages embed text taken from the uploaded file (e.g. the POS
        # tag), and the response is served as HTML, so escape each message.
        error_message = "Validation failed:<br>" + "<br>".join(html.escape(msg) for msg in errors)
        return error_message, 400, {'Content-Type': 'text/html'}

    if warnings:
        # Warnings do not block processing; they are logged here and also
        # returned to the client in a header below.
        warning_message = "Warnings:<br>" + "<br>".join(warnings)
        print(warning_message)

    # Save the valid file and process it
    input_temp_path = save_temp_file(content)
    try:
        output_file_path = process_file(input_temp_path, file.filename)

        # Wrap in a response object so headers can be attached
        response = make_response(send_file(output_file_path, as_attachment=True))
        if warnings:
            # URL-encode so the newline-joined warnings are header-safe
            response.headers['X-Warnings'] = urllib.parse.quote('\n'.join(warnings))
        return response
    except Exception as e:
        # Handle unexpected errors in processing (escaped: rendered as HTML)
        return f"Error during processing: {html.escape(str(e))}", 500
    finally:
        # The temporary input copy is no longer needed once processing ends.
        # The output file is left in place because it is still being served.
        try:
            os.remove(input_temp_path)
        except OSError:
            pass
|
| 147 |
+
|
| 148 |
+
def save_temp_file(content):
    """
    Write ``content`` as UTF-8 text to a new temporary ``.conllu`` file.

    The file is not deleted automatically; its path is returned.
    """
    handle = tempfile.NamedTemporaryFile(
        mode='w',
        encoding='utf-8',
        suffix='.conllu',
        delete=False,
    )
    try:
        handle.write(content)
    finally:
        handle.close()
    return handle.name
|
| 155 |
+
|
| 156 |
+
def process_file(input_file_path, original_filename):
    """
    Process the .conllu file using the Stanza pipeline.

    Parameters:
        input_file_path: path of the CoNLL-U file to read.
        original_filename: client-supplied name; only its base name (without
            extension) is used to name the output ``<base>_parsed.conllu``.

    Returns:
        Path of the written output file. Each call writes into its own fresh
        temporary directory, so concurrent uploads with the same name cannot
        overwrite each other.
    """
    doc = CoNLL.conll2doc(input_file=input_file_path)
    parsed_doc = nlp(doc)

    # Copy the predicted head/deprel onto the words of the loaded document.
    for orig_sentence, parsed_sentence in zip(doc.sentences, parsed_doc.sentences):
        for orig_word, parsed_word in zip(orig_sentence.words, parsed_sentence.words):
            orig_word.head = parsed_word.head
            orig_word.deprel = parsed_word.deprel

    # The filename comes from the client: drop any directory components (with
    # either separator) so it cannot point outside the output directory.
    safe_name = os.path.basename(original_filename.replace('\\', '/'))
    base_name = os.path.splitext(safe_name)[0]
    output_filename = base_name + '_parsed.conllu'

    # A private directory per call avoids collisions in the shared temp dir
    # while keeping the user-facing file name unchanged.
    output_dir = tempfile.mkdtemp(prefix='genipapo_')
    output_file_path = os.path.join(output_dir, output_filename)

    with open(output_file_path, 'w', encoding='utf-8') as f:
        # "{:C}" asks the Stanza document to format itself for CoNLL output.
        f.write("{:C}".format(doc))
        f.write('\n\n')

    return output_file_path
|
| 177 |
+
|
| 178 |
+
@app.route('/api/process', methods=['POST'])
def process_api():
    """
    API endpoint: validate and parse an uploaded ``.conllu`` file.

    Query parameter ``response_format`` selects the reply: ``json`` returns the
    processed content in a JSON body, anything else (default ``file``) returns
    it as an attachment with warnings URL-encoded in ``X-Warnings``.
    Client errors return 400 and processing failures 500, both as JSON.
    """
    response_format = request.args.get('response_format', 'file')

    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    if not file.filename.endswith('.conllu'):
        return jsonify({'error': 'Invalid file type. Only .conllu files are allowed.'}), 400

    # A non-UTF-8 upload is a client error; report it instead of crashing.
    try:
        content = file.read().decode('utf-8')
    except UnicodeDecodeError:
        return jsonify({'error': 'Invalid file encoding. The file must be UTF-8 encoded.'}), 400

    is_valid, errors, warnings = validate_conllu_file(content)
    if not is_valid:
        return jsonify({'status': 'error', 'errors': errors, 'warnings': warnings}), 400

    # Save the valid file and process it
    input_temp_path = save_temp_file(content)

    try:
        output_file_path = process_file(input_temp_path, file.filename)

        if response_format == 'json':
            # Read the processed content from the file
            with open(output_file_path, 'r', encoding='utf-8') as processed_file:
                output_content = processed_file.read()

            return jsonify({
                'status': 'success',
                'warnings': warnings,
                'processed_content': output_content
            }), 200

        # Return the processed file directly
        response = send_file(output_file_path, as_attachment=True, download_name='processed.conllu')
        if warnings:
            warnings_str = '\n'.join(warnings)
            response.headers['X-Warnings'] = urllib.parse.quote(warnings_str)
        return response
    except Exception as e:
        return jsonify({'status': 'error', 'message': str(e)}), 500
    finally:
        # The temporary input copy is not needed after processing.
        try:
            os.remove(input_temp_path)
        except OSError:
            pass
|
| 222 |
+
|
| 223 |
+
@app.route('/about')
def about():
    """Render the "about" page from its template."""
    page = render_template('about.html')
    return page
|
| 226 |
+
|
| 227 |
+
@app.route('/api/process/json', methods=['POST'])
def process_api_json():
    """
    API endpoint: validate and parse raw CoNLL-U text sent as JSON.

    The request body must be a JSON object with a non-empty string field
    ``content``. Replies are JSON: 200 with ``processed_content`` on success,
    400 for a malformed request or invalid content, 500 if processing fails.
    """
    # Check if the request body contains JSON
    if not request.is_json:
        return jsonify({'error': 'Request body must be JSON'}), 400

    data = request.get_json()
    # The body may be any JSON value (list, number, ...); only an object can
    # carry the "content" field, and that field must be text.
    content = data.get('content') if isinstance(data, dict) else None
    if not content or not isinstance(content, str):
        return jsonify({'error': 'JSON must include a "content" field with .conllu data'}), 400

    # Validate the .conllu content
    is_valid, errors, warnings = validate_conllu_file(content)
    if not is_valid:
        return jsonify({'status': 'error', 'errors': errors, 'warnings': warnings}), 400

    input_temp_path = None
    try:
        # Save the valid content to a temporary file
        input_temp_path = save_temp_file(content)
        output_file_path = process_file(input_temp_path, "input.conllu")

        # Read the processed content from the file
        with open(output_file_path, 'r', encoding='utf-8') as processed_file:
            output_content = processed_file.read()

        return jsonify({
            'status': 'success',
            'warnings': warnings,
            'processed_content': output_content
        }), 200
    except Exception as e:
        return jsonify({'status': 'error', 'message': str(e)}), 500
    finally:
        # Remove the temporary input copy if it was created.
        if input_temp_path is not None:
            try:
                os.remove(input_temp_path)
            except OSError:
                pass
|
| 259 |
+
|
| 260 |
+
@app.route('/contact')
def contact():
    """Render the "contact" page from its template."""
    page = render_template('contact.html')
    return page
|
| 263 |
+
|
| 264 |
+
@app.route('/api_guide')
def api_guide():
    """Render the API guide page from its template."""
    page = render_template('api_guide.html')
    return page
|
| 267 |
+
|
| 268 |
+
if __name__ == '__main__':
    # Direct execution: serve on every interface, port 8000.
    _RUN_OPTIONS = {'host': '0.0.0.0', 'port': 8000}
    app.run(**_RUN_OPTIONS)
|
download_resources.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
import hashlib
|
| 4 |
+
import sys
|
| 5 |
+
|
| 6 |
+
def download_genipapo_model(expected_md5=None):
    """
    Ensure ``models/genipapo.pt`` exists, downloading it if necessary.

    Parameters:
        expected_md5: optional hex MD5 digest of the model file. When given
            and the file already exists, the file is verified and downloaded
            again on mismatch. When omitted, an existing file is kept as is
            (no reference checksum is defined in this script).

    Exits the process with status 1 if the server does not answer 200.
    """
    # Direct download URL from GitHub Releases
    model_url = 'https://github.com/bryankhelven/genipapo/releases/download/Publishing/genipapo.pt'
    model_dir = os.path.join('models')
    model_path = os.path.join(model_dir, 'genipapo.pt')

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    if os.path.exists(model_path):
        if expected_md5 is None:
            # Nothing to compare against, so trust the existing file.
            print("Genipapo model already exists. No checksum provided; skipping verification.")
            return

        print("Genipapo model already exists. Verifying checksum...")
        digest = hashlib.md5()
        with open(model_path, 'rb') as f:
            # Hash in chunks so the whole model is never held in memory.
            for block in iter(lambda: f.read(1024 * 1024), b''):
                digest.update(block)
        if digest.hexdigest() == expected_md5.lower():
            print("Checksum verified. Model is ready to use.")
            return
        print("Checksum mismatch. Redownloading the model...")
        os.remove(model_path)

    print("Downloading Genipapo model...")
    response = requests.get(model_url, stream=True)
    if response.status_code != 200:
        print("Failed to download the model. Please check the URL.")
        sys.exit(1)

    # Download to a side file and rename at the end, so an interrupted
    # transfer never leaves a truncated file at the final model path.
    partial_path = model_path + '.part'
    try:
        with open(partial_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        os.replace(partial_path, model_path)
    except BaseException:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    print("Download completed. Model is ready to use.")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Directory where the resources will be saved
RESOURCE_DIR = "stanza_resources"
LANGUAGE = "pt"

# Map each required component to its (corrected) download URL
_STANZA_PT_MODELS_URL = "https://huggingface.co/stanfordnlp/stanza-pt/resolve/main/models"
REQUIRED_COMPONENTS = {
    "backward_charlm": _STANZA_PT_MODELS_URL + "/backward_charlm/oscar2023.pt",
    "forward_charlm": _STANZA_PT_MODELS_URL + "/forward_charlm/oscar2023.pt",
    "pretrain": _STANZA_PT_MODELS_URL + "/pretrain/conll17.pt",
}
|
| 50 |
+
|
| 51 |
+
# Download a file in streamed chunks (no progress reporting is implemented)
def download_file(url, dest_path, timeout=60):
    """
    Stream ``url`` to ``dest_path``.

    Parameters:
        url: address to fetch.
        dest_path: where the downloaded bytes are stored.
        timeout: seconds passed to ``requests.get`` so a stalled server cannot
            block the script indefinitely.

    Raises whatever ``requests`` raises for connection problems or a non-2xx
    status (via ``raise_for_status``). On any failure the partially written
    file is removed and ``dest_path`` is left untouched.
    """
    # Write next to the destination and rename at the end, so a failed
    # transfer never leaves a truncated file at dest_path.
    partial_path = f"{dest_path}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)
        os.replace(partial_path, dest_path)
    except BaseException:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
|
| 59 |
+
|
| 60 |
+
# Download the specific resources the application needs
def download_specific_resources():
    """
    Fetch ``resources.json`` and every entry of ``REQUIRED_COMPONENTS`` into
    ``RESOURCE_DIR``.

    Layout written: ``RESOURCE_DIR/resources.json`` and
    ``RESOURCE_DIR/LANGUAGE/<component>/model.pt`` for each component.
    NOTE(review): every component is stored as "model.pt" rather than under
    the file name used in its URL, and the resources index is pinned to
    1.6.0 -- confirm both match what the installed Stanza expects.
    """
    if not os.path.exists(RESOURCE_DIR):
        os.makedirs(RESOURCE_DIR)

    # Download the `resources.json` file
    index_url = "https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json"
    index_path = os.path.join(RESOURCE_DIR, "resources.json")
    print("Baixando resources.json...")
    download_file(index_url, index_path)

    # Base path for the language's resources
    language_root = os.path.join(RESOURCE_DIR, LANGUAGE)
    if not os.path.exists(language_root):
        os.makedirs(language_root)

    # Download the required components
    for name in REQUIRED_COMPONENTS:
        source_url = REQUIRED_COMPONENTS[name]
        target_dir = os.path.join(language_root, name)
        os.makedirs(target_dir, exist_ok=True)
        target_path = os.path.join(target_dir, "model.pt")
        print(f"Baixando {name}...")
        download_file(source_url, target_path)
        print(f"{name} baixado para {target_path}")

    print("Download concluído. Recursos disponíveis em:", RESOURCE_DIR)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
if __name__ == '__main__':
    # Script entry point: run each download step in order, the parser model
    # first and then the Stanza resources.
    for _step in (download_genipapo_model, download_specific_resources):
        _step()
|
| 92 |
+
|
example.conllu
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# sent_id = dante_01_441020223408578560l
|
| 2 |
+
# text = #PETR4 - Análise #Ichimoku - pregão de sexta-feira, 28 de fevereiro. http://t.co/oAHK5pB3e0
|
| 3 |
+
1 #PETR4 #PETR4 PROPN _ _ _ _ _ _
|
| 4 |
+
2 - - PUNCT _ _ _ _ _ _
|
| 5 |
+
3 Análise Análise PROPN _ _ _ _ _ _
|
| 6 |
+
4 #Ichimoku #Ichimoku PROPN _ _ _ _ _ _
|
| 7 |
+
5 - - PUNCT _ _ _ _ _ _
|
| 8 |
+
6 pregão pregão NOUN _ Gender=Masc|Number=Sing _ _ _ _
|
| 9 |
+
7 de de ADP _ _ _ _ _ _
|
| 10 |
+
8 sexta-feira sexta-feira NOUN _ Gender=Fem|Number=Sing _ _ _ SpaceAfter=No
|
| 11 |
+
9 , , PUNCT _ _ _ _ _ _
|
| 12 |
+
10 28 28 NUM _ NumType=Card _ _ _ _
|
| 13 |
+
11 de de ADP _ _ _ _ _ _
|
| 14 |
+
12 fevereiro fevereiro NOUN _ Gender=Masc|Number=Sing _ _ _ SpaceAfter=No
|
| 15 |
+
13 . . PUNCT _ _ _ _ _ _
|
| 16 |
+
14 http://t.co/oAHK5pB3e0 http://t.co/oAHK5pB3e0 SYM _ _ _ _ _ SpaceAfter=No
|
| 17 |
+
|
| 18 |
+
# sent_id = FOLHA_DOC000123_SENT016
|
| 19 |
+
# text = O Capitão América também bajulou o tucano.
|
| 20 |
+
1 O o DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art _ _ _ _
|
| 21 |
+
2 Capitão Capitão PROPN _ _ _ _ _ _
|
| 22 |
+
3 América América PROPN _ _ _ _ _ _
|
| 23 |
+
4 também também ADV _ _ _ _ _ _
|
| 24 |
+
5 bajulou bajular VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin _ _ _ _
|
| 25 |
+
6 o o DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art _ _ _ _
|
| 26 |
+
7 tucano tucano NOUN _ Gender=Masc|Number=Sing _ _ _ SpaceAfter=No
|
| 27 |
+
8 . . PUNCT _ _ _ _ _ SpaceAfter=No
|
| 28 |
+
|
| 29 |
+
# sent_id = 119-20141209-TESEMSC_0-9
|
| 30 |
+
# text = Atualmente, a bentonita sódica é a argila comercial mais utilizada em fluidos de perfuração (Amorim, 2003).
|
| 31 |
+
1 Atualmente atualmente ADV _ _ _ _ _ SpaceAfter=No
|
| 32 |
+
2 , , PUNCT _ _ _ _ _ _
|
| 33 |
+
3 a o DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art _ _ _ _
|
| 34 |
+
4 bentonita bentonita NOUN _ Gender=Fem|Number=Sing _ _ _ _
|
| 35 |
+
5 sódica sódico ADJ _ Gender=Fem|Number=Sing _ _ _ _
|
| 36 |
+
6 é ser AUX _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin _ _ _ _
|
| 37 |
+
7 a o DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art _ _ _ _
|
| 38 |
+
8 argila argila NOUN _ Gender=Fem|Number=Sing _ _ _ _
|
| 39 |
+
9 comercial comercial ADJ _ Gender=Fem|Number=Sing _ _ _ _
|
| 40 |
+
10 mais mais ADV _ _ _ _ _ _
|
| 41 |
+
11 utilizada utilizar VERB _ Gender=Fem|Number=Sing|VerbForm=Part _ _ _ _
|
| 42 |
+
12 em em ADP _ _ _ _ _ _
|
| 43 |
+
13 fluidos fluido NOUN _ Gender=Masc|Number=Plur _ _ _ _
|
| 44 |
+
14 de de ADP _ _ _ _ _ _
|
| 45 |
+
15 perfuração perfuração NOUN _ Gender=Fem|Number=Sing _ _ _ _
|
| 46 |
+
16 ( ( PUNCT _ _ _ _ _ SpaceAfter=No
|
| 47 |
+
17 Amorim Amorim PROPN _ Gender=Masc|Number=Sing _ _ _ SpaceAfter=No
|
| 48 |
+
18 , , PUNCT _ _ _ _ _ _
|
| 49 |
+
19 2003 2003 NUM _ NumType=Card _ _ _ SpaceAfter=No
|
| 50 |
+
20 ) ) PUNCT _ _ _ _ _ SpaceAfter=No
|
| 51 |
+
21 . . PUNCT _ _ _ _ _ _
|
| 52 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Flask>=1.1.2
|
| 2 |
+
gunicorn>=20.0.4
|
| 3 |
+
conllu>=4.4.2
|
| 4 |
+
|
| 5 |
+
# Note:
|
| 6 |
+
# Ensure that `libffi` is installed on your system, as it is required for compatibility with PyTorch.
|
| 7 |
+
# Installation instructions by operating system:
|
| 8 |
+
#
|
| 9 |
+
# - **Ubuntu/Debian**: Run `sudo apt install libffi-dev`
|
| 10 |
+
# - **CentOS/RHEL**: Run `sudo yum install libffi-devel`
|
| 11 |
+
# - **macOS**: Install via Homebrew with `brew install libffi`
|
| 12 |
+
# - **Windows**: `libffi` is included in Windows Python distributions, so no extra installation is typically needed.
|
| 13 |
+
#
|
| 14 |
+
# On WSL (Windows Subsystem for Linux), use the same command as for Linux distributions.
|
| 15 |
+
# If you encounter issues, check if `libffi.so.6` exists. If missing, create a symbolic link to the installed version:
|
| 16 |
+
# `sudo ln -s /usr/lib/x86_64-linux-gnu/libffi.so.7 /usr/lib/x86_64-linux-gnu/libffi.so.6`
|
run_parser.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import stanza
|
| 2 |
+
from stanza.utils.conll import CoNLL
|
| 3 |
+
import sys
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
def main(input_file, output_file='output.conllu'):
    """Parse a pre-tagged CoNLL-U file with the Genipapo dependency model.

    The input is loaded with Stanza's CoNLL reader, run through a Portuguese
    pipeline that contains only the dependency parser (using the custom
    model at ``models/genipapo.pt``), and written back out in CoNLL-U format.

    Args:
        input_file: Path to the input ``.conllu`` file. It must already be
            tokenized and POS-tagged, because the pipeline is created with
            ``tokenize_pretokenized=True`` and ``depparse_pretagged=True``.
        output_file: Path of the CoNLL-U file to write. Defaults to
            ``'output.conllu'`` in the current working directory.

    Returns:
        None. If the model file is missing, a hint is printed and the
        function returns without parsing or writing anything.
    """
    # The model is looked up relative to the current working directory.
    model_path = os.path.join('models', 'genipapo.pt')

    if not os.path.isfile(model_path):
        # NOTE(review): this repository ships 'download_resources.py' and no
        # 'download_model.py' -- confirm that script is the one that fetches
        # the model.
        print(
            f"Genipapo model not found at '{model_path}'. "
            "Please run 'download_resources.py' first to download the model."
        )
        return

    # Dependency parser only; nothing is downloaded at pipeline creation.
    nlp = stanza.Pipeline(
        lang='pt',
        processors='depparse',
        depparse_pretagged=True,  # Assumes the input file has POS tags already
        depparse_model_path=model_path,
        tokenize_pretokenized=True,  # Assumes tokens are already split in .conllu format
        use_gpu=False,
        download_method=None,
    )

    doc = CoNLL.conll2doc(input_file=input_file)
    parsed_doc = nlp(doc)

    # Copy the predicted heads and relations back onto the loaded document.
    # NOTE(review): the pipeline may return the same Document object it was
    # given, in which case this loop is a harmless no-op -- confirm.
    for orig_sentence, parsed_sentence in zip(doc.sentences, parsed_doc.sentences):
        for orig_word, parsed_word in zip(orig_sentence.words, parsed_sentence.words):
            orig_word.head = parsed_word.head
            orig_word.deprel = parsed_word.deprel

    # Save the updated document in CoNLL-U format, followed by two newlines.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("{:C}".format(doc))
        f.write('\n\n')

    print(f"Updated CONLLU file saved to '{output_file}'")
|
| 44 |
+
|
| 45 |
+
if __name__ == '__main__':
    # Command-line entry point: exactly one argument is expected, the path
    # to the input CoNLL-U file.
    if len(sys.argv) != 2:
        # sys.exit with a string writes it to stderr and exits with status 1,
        # so shells and scripts can detect the failure.
        sys.exit("Usage: python run_parser.py path/to/your_file.conllu")

    input_file = sys.argv[1]
    if not os.path.exists(input_file):
        sys.exit(f"Input file {input_file} does not exist.")

    main(input_file)
|
startup.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
gunicorn --bind=0.0.0.0 --timeout 1200 app:app
|
static/geni.jpg
ADDED
|
Git LFS Details
|
templates/about.html
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>About - Genipapo Parser</title>
|
| 7 |
+
<style>
|
| 8 |
+
body {
|
| 9 |
+
font-family: Arial, sans-serif;
|
| 10 |
+
margin: 0;
|
| 11 |
+
padding: 0;
|
| 12 |
+
background: url('/static/geni.jpg') no-repeat center center fixed;
|
| 13 |
+
background-size: cover;
|
| 14 |
+
background-color: rgba(0, 0, 0, 0.65);
|
| 15 |
+
background-blend-mode: overlay;
|
| 16 |
+
line-height: 1.6;
|
| 17 |
+
padding: 20px;
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
.header {
|
| 21 |
+
width: 100%;
|
| 22 |
+
background: rgba(3, 112, 49, 0.8);
|
| 23 |
+
padding: 10px 0;
|
| 24 |
+
position: fixed;
|
| 25 |
+
top: 0;
|
| 26 |
+
left: 0;
|
| 27 |
+
z-index: 1000;
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
.header nav {
|
| 31 |
+
display: flex;
|
| 32 |
+
justify-content: center;
|
| 33 |
+
gap: 1em;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
.header a {
|
| 37 |
+
text-decoration: none;
|
| 38 |
+
color: white;
|
| 39 |
+
padding: 10px 15px;
|
| 40 |
+
border-radius: 5px;
|
| 41 |
+
transition: background-color 0.3s;
|
| 42 |
+
font-size: 1em;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
.header a:hover, .header a.active {
|
| 46 |
+
background-color: rgba(4, 63, 28, 0.8);
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
.content {
|
| 50 |
+
margin: 80px auto 0; /* Space below the fixed header */
|
| 51 |
+
max-width: 800px;
|
| 52 |
+
background-color: rgba(255, 255, 255, 0.8);
|
| 53 |
+
border-radius: 8px;
|
| 54 |
+
padding: 20px;
|
| 55 |
+
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
h1 {
|
| 59 |
+
text-align: center;
|
| 60 |
+
font-size: 2.5em;
|
| 61 |
+
margin-bottom: 20px;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
p {
|
| 65 |
+
margin-bottom: 1em;
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
ul {
|
| 69 |
+
list-style: disc;
|
| 70 |
+
margin: 10px 0 20px 20px;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
li {
|
| 74 |
+
margin-bottom: 10px;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
a {
|
| 78 |
+
color: #2946c5;
|
| 79 |
+
text-decoration: underline;
|
| 80 |
+
transition: color 0.3s;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
a:hover {
|
| 84 |
+
color: #5573f8;
|
| 85 |
+
}
|
| 86 |
+
</style>
|
| 87 |
+
</head>
|
| 88 |
+
<body>
|
| 89 |
+
<header class="header">
|
| 90 |
+
<nav>
|
| 91 |
+
<a href="/">Genipapo</a>
|
| 92 |
+
<a href="about" class="active">About</a>
|
| 93 |
+
<a href="api_guide">API Guide</a>
|
| 94 |
+
<a href="contact">Contact Us</a>
|
| 95 |
+
</nav>
|
| 96 |
+
</header>
|
| 97 |
+
|
| 98 |
+
<div class="content">
|
| 99 |
+
<h1>Genipapo Parser</h1>
|
| 100 |
+
<p>
|
| 101 |
+
Genipapo is a multigenre dependency parser specifically tailored for Brazilian Portuguese, developed in alignment with the Universal Dependencies (UD) framework. Trained using three distinct gold-standard corpora - including journalistic texts, academic papers in the oil and gas domain, and user-generated content from X posts (formerly Twitter) - Genipapo delivers robust syntactic analysis across diverse text genres. Achieving a Labelled Attachment Score (LAS) exceeding 94%, it outperforms or matches the performance of single-genre parsers, making it a versatile tool for use in Natural Language Processing applications.
|
| 102 |
+
</p>
|
| 103 |
+
<h2>Acknowledgments</h2>
|
| 104 |
+
<ul>
|
| 105 |
+
<li>
|
| 106 |
+
This work was carried out at the Center for Artificial Intelligence of the University of São Paulo (<a href="http://c4ai.inova.usp.br/">C4AI</a>), with support from the São Paulo Research Foundation (FAPESP grant #2019/07665-4) and from the IBM Corporation. The project was also supported by the Ministry of Science, Technology and Innovation, with resources of Law N. 8.248, of October 23, 1991, within the scope of PPI-SOFTEX, coordinated by Softex and published as Residence in TIC 13, DOU 01245.010222/2022-44.
|
| 107 |
+
</li>
|
| 108 |
+
<li>
|
| 109 |
+
Genipapo was developed using the <a href="https://stanfordnlp.github.io/stanza/">Stanza</a> library. We thank the Stanford NLP Group for providing this tool for the NLP community.
|
| 110 |
+
</li>
|
| 111 |
+
</ul>
|
| 112 |
+
<h2>How to cite</h2>
|
| 113 |
+
<p>
|
| 114 |
+
Di Felippo, A.; Roman, N.T.; Barbosa, B.K.S.; Pardo, T.A.S. (2024). Genipapo - a Multigenre Dependency Parsing for Brazilian Portuguese. In the Proceedings of the 15th Symposium in Information and Human Language Technology (STIL). November, 17-21. Belém-PA, Brazil. p. 257-266. DOI: <a href="https://doi.org/10.5753/stil.2024.245415">https://doi.org/10.5753/stil.2024.245415</a>
|
| 115 |
+
</p>
|
| 116 |
+
</div>
|
| 117 |
+
</body>
|
| 118 |
+
</html>
|
templates/api_guide.html
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>API Guide - Genipapo Parser</title>
|
| 7 |
+
<style>
|
| 8 |
+
body {
|
| 9 |
+
font-family: Arial, sans-serif;
|
| 10 |
+
margin: 0;
|
| 11 |
+
padding: 0;
|
| 12 |
+
background: url('/static/geni.jpg') no-repeat center center fixed;
|
| 13 |
+
background-size: cover;
|
| 14 |
+
background-color: rgba(0, 0, 0, 0.65);
|
| 15 |
+
background-blend-mode: overlay;
|
| 16 |
+
line-height: 1.6;
|
| 17 |
+
padding: 20px;
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
.header {
|
| 21 |
+
width: 100%;
|
| 22 |
+
background: rgba(3, 112, 49, 0.8);
|
| 23 |
+
padding: 10px 0;
|
| 24 |
+
position: fixed;
|
| 25 |
+
top: 0;
|
| 26 |
+
left: 0;
|
| 27 |
+
z-index: 1000;
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
.header nav {
|
| 31 |
+
display: flex;
|
| 32 |
+
justify-content: center;
|
| 33 |
+
gap: 1em;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
.header a {
|
| 37 |
+
text-decoration: none;
|
| 38 |
+
color: white;
|
| 39 |
+
padding: 10px 15px;
|
| 40 |
+
border-radius: 5px;
|
| 41 |
+
transition: background-color 0.3s;
|
| 42 |
+
font-size: 1em;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
.header a:hover, .header a.active {
|
| 46 |
+
background-color: rgba(4, 63, 28, 0.8);
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
.content {
|
| 50 |
+
margin: 80px auto 0;
|
| 51 |
+
max-width: 800px;
|
| 52 |
+
background-color: rgba(255, 255, 255, 0.8);
|
| 53 |
+
border-radius: 8px;
|
| 54 |
+
padding: 20px;
|
| 55 |
+
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
h1 {
|
| 59 |
+
text-align: center;
|
| 60 |
+
font-size: 2.5em;
|
| 61 |
+
margin-bottom: 20px;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
p, pre {
|
| 65 |
+
margin-bottom: 1em;
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
pre {
|
| 69 |
+
white-space: pre-wrap;
|
| 70 |
+
word-wrap: break-word;
|
| 71 |
+
overflow-wrap: break-word;
|
| 72 |
+
padding: 10px;
|
| 73 |
+
background: #f9f9f9;
|
| 74 |
+
border: 1px solid #ccc;
|
| 75 |
+
border-radius: 4px;
|
| 76 |
+
overflow-x: auto; /* Horizontal scrolling only if necessary */
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
ul {
|
| 80 |
+
list-style: disc;
|
| 81 |
+
margin: 10px 0 20px 20px;
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
li {
|
| 85 |
+
margin-bottom: 10px;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
a {
|
| 89 |
+
color: #2946c5;
|
| 90 |
+
text-decoration: underline;
|
| 91 |
+
transition: color 0.3s;
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
a:hover {
|
| 95 |
+
color: #5573f8;
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
/* Inline code */
|
| 99 |
+
code {
|
| 100 |
+
background: #f4f4f4;
|
| 101 |
+
border: 1px solid #ddd;
|
| 102 |
+
padding: 2px 5px;
|
| 103 |
+
border-radius: 3px;
|
| 104 |
+
font-family: monospace;
|
| 105 |
+
white-space: normal; /* Inline allows breaking into lines */
|
| 106 |
+
word-wrap: break-word;
|
| 107 |
+
overflow-wrap: break-word;
|
| 108 |
+
display: inline; /* Ensures inline behavior */
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
/* Block code (inside <pre>) */
|
| 112 |
+
pre code {
|
| 113 |
+
background: #f9f9f9;
|
| 114 |
+
border: 1px solid #ccc;
|
| 115 |
+
padding: 10px;
|
| 116 |
+
border-radius: 4px;
|
| 117 |
+
font-family: monospace;
|
| 118 |
+
white-space: pre-wrap; /* Preserves formatting while allowing line breaks */
|
| 119 |
+
word-wrap: break-word;
|
| 120 |
+
overflow-wrap: break-word;
|
| 121 |
+
display: block; /* Ensures block behavior */
|
| 122 |
+
overflow-x: auto; /* Horizontal scrolling only if necessary */
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
</style>
|
| 126 |
+
</head>
|
| 127 |
+
<body>
|
| 128 |
+
<header class="header">
|
| 129 |
+
<nav>
|
| 130 |
+
<a href="/">Genipapo</a>
|
| 131 |
+
<a href="about">About</a>
|
| 132 |
+
<a href="api_guide" class="active">API Guide</a>
|
| 133 |
+
<a href="contact">Contact Us</a>
|
| 134 |
+
</nav>
|
| 135 |
+
</header>
|
| 136 |
+
|
| 137 |
+
<div class="content">
|
| 138 |
+
<h1>Genipapo API Guide</h1>
|
| 139 |
+
<p>
|
| 140 |
+
This guide provides instructions on how to use the Genipapo Parser API for processing
|
| 141 |
+
Brazilian Portuguese text in CoNLL-U format.
|
| 142 |
+
</p>
|
| 143 |
+
<p>
|
| 144 |
+
All the examples provided in this guide were extracted from the <strong>Porttinari Base</strong> corpus,
|
| 145 |
+
part of the <a href="https://sites.google.com/icmc.usp.br/poetisa/porttinari-2-0" target="_blank">Poetisa project</a>.
|
| 146 |
+
</p>
|
| 147 |
+
|
| 148 |
+
<h2>Endpoints</h2>
|
| 149 |
+
<ul>
|
| 150 |
+
<li><strong>POST /api/process</strong> - Process a <code>.conllu</code> file.</li>
|
| 151 |
+
<li><strong>POST /api/process/json</strong> - Process raw <code>.conllu</code> content in JSON format.</li>
|
| 152 |
+
</ul>
|
| 153 |
+
|
| 154 |
+
<h2>1. Process a File</h2>
|
| 155 |
+
<p>
|
| 156 |
+
Use the <code>/api/process</code> endpoint to upload a <code>.conllu</code> file. The endpoint accepts the following parameter:
|
| 157 |
+
</p>
|
| 158 |
+
<ul>
|
| 159 |
+
<li><strong>response_format</strong> (optional): Set to <code>json</code> to return processed content as JSON. Defaults to <code>file</code>.</li>
|
| 160 |
+
</ul>
|
| 161 |
+
|
| 162 |
+
<h3>1.1 Example: Returning a File</h3>
|
| 163 |
+
<p>
|
| 164 |
+
When <code>response_format</code> is set to <code>file</code>, the processed content is returned as a downloadable
|
| 165 |
+
<code>.conllu</code> file. Specify the output filename using <code>--output</code>.
|
| 166 |
+
</p>
|
| 167 |
+
<pre><code>curl -X POST -H "Content-Type: multipart/form-data" \
|
| 168 |
+
-F "file=@example.conllu" \
|
| 169 |
+
"https://genipapo-parser.azurewebsites.net/api/process?response_format=file" \
|
| 170 |
+
--output processed_example.conllu</code></pre>
|
| 171 |
+
|
| 172 |
+
<h3>1.2 Example: Returning JSON</h3>
|
| 173 |
+
<p>
|
| 174 |
+
When <code>response_format</code> is set to <code>json</code>, the processed content is returned in JSON format.
|
| 175 |
+
</p>
|
| 176 |
+
<pre><code>curl -X POST -H "Content-Type: multipart/form-data" \
|
| 177 |
+
-F "file=@example.conllu" \
|
| 178 |
+
"https://genipapo-parser.azurewebsites.net/api/process?response_format=json"</code></pre>
|
| 179 |
+
|
| 180 |
+
<h4>Example JSON Response:</h4>
|
| 181 |
+
<pre><code>{
|
| 182 |
+
"status": "success",
|
| 183 |
+
"warnings": [],
|
| 184 |
+
"processed_content": "# sent_id = FOLHA_DOC000123_SENT016\n# text = O Capitão América também bajulou o tucano.\n1\tO\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t2\tdet\t_\t_\n2\tCapitão\tCapitão\tPROPN\t_\t_\t5\tnsubj\t_\t_\n3\tAmérica\tAmérica\tPROPN\t_\t_\t2\tflat:name\t_\t_\n4\ttambém\ttambém\tADV\t_\t_\t5\tadvmod\t_\t_\n5\tbajulou\tbajular\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t0\troot\t_\t_\n6\to\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t7\tdet\t_\t_\n7\ttucano\ttucano\tNOUN\t_\tGender=Masc|Number=Sing\t5\tobj\t_\tSpaceAfter=No\n8\t.\t.\tPUNCT\t_\t_\t5\tpunct\t_\tSpaceAfter=No\n"
|
| 185 |
+
}</code></pre>
|
| 186 |
+
|
| 187 |
+
<h2>2. Process Raw Content</h2>
|
| 188 |
+
<p>
|
| 189 |
+
Use the <code>/api/process/json</code> endpoint to send raw CoNLL-U content as JSON. Include the content
|
| 190 |
+
in the <code>content</code> field of the JSON body.
|
| 191 |
+
</p>
|
| 192 |
+
<pre><code>curl -X POST -H "Content-Type: application/json" \
|
| 193 |
+
-d '{"content": "# sent_id = FOLHA_DOC000123_SENT016\n# text = O Capitão América também bajulou o tucano.\n1\tO\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t_\t_\t_\t_\n2\tCapitão\tCapitão\tPROPN\t_\t_\t_\t_\t_\t_\n3\tAmérica\tAmérica\tPROPN\t_\t_\t_\t_\t_\t_\n4\ttambém\ttambém\tADV\t_\t_\t_\t_\t_\t_\n5\tbajulou\tbajular\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t_\t_\t_\t_\n6\to\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t_\t_\t_\t_\n7\ttucano\ttucano\tNOUN\t_\tGender=Masc|Number=Sing\t_\t_\t_\tSpaceAfter=No\n8\t.\t.\tPUNCT\t_\t_\t_\t_\t_\tSpaceAfter=No"}' \
|
| 203 |
+
"http://localhost:8000/api/process/json"</code></pre>
|
| 204 |
+
|
| 205 |
+
<h4>Example JSON Response:</h4>
|
| 206 |
+
<pre><code>{
|
| 207 |
+
"status": "success",
|
| 208 |
+
"warnings": [],
|
| 209 |
+
"processed_content": "# sent_id = FOLHA_DOC000123_SENT016\n# text = O Capitão América também bajulou o tucano.\n1\tO\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t2\tdet\t_\t_\n2\tCapitão\tCapitão\tPROPN\t_\t_\t5\tnsubj\t_\t_\n3\tAmérica\tAmérica\tPROPN\t_\t_\t2\tflat:name\t_\t_\n4\ttambém\ttambém\tADV\t_\t_\t5\tadvmod\t_\t_\n5\tbajulou\tbajular\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin\t0\troot\t_\t_\n6\to\to\tDET\t_\tDefinite=Def|Gender=Masc|Number=Sing|PronType=Art\t7\tdet\t_\t_\n7\ttucano\ttucano\tNOUN\t_\tGender=Masc|Number=Sing\t5\tobj\t_\tSpaceAfter=No\n8\t.\t.\tPUNCT\t_\t_\t5\tpunct\t_\tSpaceAfter=No\n"
|
| 210 |
+
}</code></pre>
|
| 211 |
+
|
| 212 |
+
<h3>Example with Input and Output</h3>
|
| 213 |
+
|
| 214 |
+
<h4>Original Input</h4>
|
| 215 |
+
<pre><code># sent_id = FOLHA_DOC000123_SENT016
|
| 216 |
+
# text = O Capitão América também bajulou o tucano.
|
| 217 |
+
1 O o DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art _ _ _ _
|
| 218 |
+
2 Capitão Capitão PROPN _ _ _ _ _ _
|
| 219 |
+
3 América América PROPN _ _ _ _ _ _
|
| 220 |
+
4 também também ADV _ _ _ _ _ _
|
| 221 |
+
5 bajulou bajular VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin _ _ _ _
|
| 222 |
+
6 o o DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art _ _ _ _
|
| 223 |
+
7 tucano tucano NOUN _ Gender=Masc|Number=Sing _ _ _ SpaceAfter=No
|
| 224 |
+
8 . . PUNCT _ _ _ _ _ SpaceAfter=No</code></pre>
|
| 225 |
+
|
| 226 |
+
<h4>Processed Output</h4>
|
| 227 |
+
<pre><code># sent_id = FOLHA_DOC000123_SENT016
|
| 228 |
+
# text = O Capitão América também bajulou o tucano.
|
| 229 |
+
1 O o DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 2 det _ _
|
| 230 |
+
2 Capitão Capitão PROPN _ _ 5 nsubj _ _
|
| 231 |
+
3 América América PROPN _ _ 2 flat:name _ _
|
| 232 |
+
4 também também ADV _ _ 5 advmod _ _
|
| 233 |
+
5 bajulou bajular VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 0 root _ _
|
| 234 |
+
6 o o DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 7 det _ _
|
| 235 |
+
7 tucano tucano NOUN _ Gender=Masc|Number=Sing 5 obj _ SpaceAfter=No
|
| 236 |
+
8 . . PUNCT _ _ 5 punct _ SpaceAfter=No</code></pre>
|
| 237 |
+
|
| 238 |
+
<h2>Contact</h2>
|
| 239 |
+
<p>
|
| 240 |
+
For further assistance, please <a href="https://genipapo-parser.azurewebsites.net/contact">contact us</a>.
|
| 241 |
+
</p>
|
| 242 |
+
</div>
|
| 243 |
+
|
templates/contact.html
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>Contact Us - Genipapo Parser</title>
|
| 7 |
+
<style>
|
| 8 |
+
body {
|
| 9 |
+
font-family: Arial, sans-serif;
|
| 10 |
+
margin: 0;
|
| 11 |
+
padding: 0;
|
| 12 |
+
background: url('/static/geni.jpg') no-repeat center center fixed;
|
| 13 |
+
background-size: cover;
|
| 14 |
+
background-color: rgba(0, 0, 0, 0.65);
|
| 15 |
+
background-blend-mode: overlay;
|
| 16 |
+
line-height: 1.6;
|
| 17 |
+
padding: 20px;
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
.header {
|
| 21 |
+
width: 100%;
|
| 22 |
+
background: rgba(3, 112, 49, 0.8);
|
| 23 |
+
padding: 10px 0;
|
| 24 |
+
position: fixed;
|
| 25 |
+
top: 0;
|
| 26 |
+
left: 0;
|
| 27 |
+
z-index: 1000;
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
.header nav {
|
| 31 |
+
display: flex;
|
| 32 |
+
justify-content: center;
|
| 33 |
+
gap: 1em;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
.header a {
|
| 37 |
+
text-decoration: none;
|
| 38 |
+
color: white;
|
| 39 |
+
padding: 10px 15px;
|
| 40 |
+
border-radius: 5px;
|
| 41 |
+
transition: background-color 0.3s;
|
| 42 |
+
font-size: 1em;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
.header a:hover, .header a.active {
|
| 46 |
+
background-color: rgba(4, 63, 28, 0.8);
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
.content {
|
| 50 |
+
margin: 80px auto 0; /* Space below the fixed header */
|
| 51 |
+
max-width: 600px;
|
| 52 |
+
background-color: rgba(255, 255, 255, 0.8);
|
| 53 |
+
border-radius: 8px;
|
| 54 |
+
padding: 20px;
|
| 55 |
+
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
h1 {
|
| 59 |
+
text-align: center;
|
| 60 |
+
font-size: 2.5em;
|
| 61 |
+
margin-bottom: 20px;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
p {
|
| 65 |
+
margin-bottom: 1em;
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
a {
|
| 69 |
+
color: #2946c5;
|
| 70 |
+
text-decoration: underline;
|
| 71 |
+
transition: color 0.3s;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
a:hover {
|
| 75 |
+
color: #5573f8;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
.email, .github-link {
|
| 79 |
+
text-align: center;
|
| 80 |
+
font-size: 1.2em;
|
| 81 |
+
margin: 15px 0;
|
| 82 |
+
}
|
| 83 |
+
</style>
|
| 84 |
+
</head>
|
| 85 |
+
<body>
|
| 86 |
+
<header class="header">
|
| 87 |
+
<nav>
|
| 88 |
+
<a href="/">Genipapo</a>
|
| 89 |
+
<a href="about">About</a>
|
| 90 |
+
<a href="api_guide">API Guide</a>
|
| 91 |
+
<a href="contact" class="active">Contact Us</a>
|
| 92 |
+
</nav>
|
| 93 |
+
</header>
|
| 94 |
+
|
| 95 |
+
<div class="content">
|
| 96 |
+
<h1>Contact Us</h1>
|
| 97 |
+
<p class="email">
|
| 98 |
+
For inquiries, you can reach us at:
|
| 99 |
+
<a href="mailto:bryankhelven@ieee.org">bryankhelven@ieee.org</a>
|
| 100 |
+
</p>
|
| 101 |
+
<p class="github-link">
|
| 102 |
+
If you'd like to run the parser locally, visit our GitHub repository: <br>
|
| 103 |
+
<a href="https://github.com/bryankhelven/genipapo" target="_blank">Genipapo repository</a>
|
| 104 |
+
</p>
|
| 105 |
+
</div>
|
| 106 |
+
</body>
|
| 107 |
+
</html>
|
templates/upload_conllu.html
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<header class="header">
|
| 4 |
+
<nav>
|
| 5 |
+
<a href="/" class="active">Genipapo</a>
|
| 6 |
+
<a href="about">About</a>
|
| 7 |
+
<a href="api_guide">API Guide</a>
|
| 8 |
+
<a href="contact">Contact Us</a>
|
| 9 |
+
</nav>
|
| 10 |
+
</header>
|
| 11 |
+
<head>
|
| 12 |
+
<meta charset="UTF-8">
|
| 13 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 14 |
+
<title>Genipapo Parser</title>
|
| 15 |
+
<style>
|
| 16 |
+
body {
|
| 17 |
+
display: flex;
|
| 18 |
+
justify-content: center;
|
| 19 |
+
align-items: center;
|
| 20 |
+
height: 100vh;
|
| 21 |
+
font-family: Arial, sans-serif;
|
| 22 |
+
margin: 0;
|
| 23 |
+
background: url('/static/geni.jpg') no-repeat center center fixed;
|
| 24 |
+
background-size: cover;
|
| 25 |
+
background-color: rgba(0, 0, 0, 0.65); /* Optional overlay for translucency */
|
| 26 |
+
background-blend-mode: overlay;
|
| 27 |
+
}
|
| 28 |
+
#upload-container {
|
| 29 |
+
text-align: center;
|
| 30 |
+
background-color: rgba(255, 255, 255, 0.8); /* White background with 80% opacity */
|
| 31 |
+
padding: 2em;
|
| 32 |
+
box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.1);
|
| 33 |
+
border-radius: 8px;
|
| 34 |
+
max-width: 500px;
|
| 35 |
+
}
|
| 36 |
+
#button-container {
|
| 37 |
+
display: flex;
|
| 38 |
+
justify-content: center;
|
| 39 |
+
align-items: center;
|
| 40 |
+
gap: 1em; /* Space between buttons */
|
| 41 |
+
flex-wrap: wrap; /* Allow buttons to wrap if needed */
|
| 42 |
+
}
|
| 43 |
+
.custom-file-label {
|
| 44 |
+
display: inline-block;
|
| 45 |
+
background-color: #007bff;
|
| 46 |
+
color: white;
|
| 47 |
+
padding: 0.5em 1em;
|
| 48 |
+
border-radius: 4px;
|
| 49 |
+
cursor: pointer;
|
| 50 |
+
text-align: center;
|
| 51 |
+
max-width: 200px; /* Limit width for long file names */
|
| 52 |
+
white-space: nowrap;
|
| 53 |
+
overflow: hidden;
|
| 54 |
+
text-overflow: ellipsis; /* Add "..." for long text */
|
| 55 |
+
}
|
| 56 |
+
.custom-file-label:hover {
|
| 57 |
+
background-color: #0056b3;
|
| 58 |
+
}
|
| 59 |
+
#submitBtn {
|
| 60 |
+
padding: 0.5em 2em;
|
| 61 |
+
background-color: #28a745;
|
| 62 |
+
color: white;
|
| 63 |
+
border: none;
|
| 64 |
+
cursor: pointer;
|
| 65 |
+
border-radius: 4px;
|
| 66 |
+
}
|
| 67 |
+
#submitBtn:disabled {
|
| 68 |
+
background-color: #999;
|
| 69 |
+
}
|
| 70 |
+
#message {
|
| 71 |
+
margin-top: 1em;
|
| 72 |
+
}
|
| 73 |
+
h1 {
|
| 74 |
+
font-size: 2em;
|
| 75 |
+
margin-bottom: 1em;
|
| 76 |
+
}
|
| 77 |
+
p {
|
| 78 |
+
font-size: 1.2em;
|
| 79 |
+
margin-bottom: 1em;
|
| 80 |
+
}
|
| 81 |
+
.header {
|
| 82 |
+
width: 100%;
|
| 83 |
+
background: rgba(3, 112, 49, 0.8); /* Green header bar at 80% opacity */
|
| 84 |
+
padding: 10px 0;
|
| 85 |
+
position: fixed;
|
| 86 |
+
top: 0;
|
| 87 |
+
left: 0;
|
| 88 |
+
z-index: 1000;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
.header nav {
|
| 92 |
+
display: flex;
|
| 93 |
+
justify-content: center; /* Center the navigation links horizontally */
|
| 94 |
+
gap: 1em; /* Space between the navigation links */
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
.header a {
|
| 98 |
+
text-decoration: none;
|
| 99 |
+
color: white;
|
| 100 |
+
padding: 10px 15px;
|
| 101 |
+
border-radius: 5px;
|
| 102 |
+
transition: background-color 0.3s;
|
| 103 |
+
font-size: 1em; /* Consistent size */
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
.header a:hover, .header a.active {
|
| 107 |
+
background-color: rgba(4, 63, 28, 0.8); /* Darker shade on hover / for the active link */
|
| 108 |
+
}
|
| 109 |
+
</style>
|
| 110 |
+
</head>
|
| 111 |
+
<body>
|
| 112 |
+
<div id="upload-container">
|
| 113 |
+
<h1>Upload a conllu File</h1>
|
| 114 |
+
<p>Please upload a conllu file that is properly structured and already has POS tags.</p>
|
| 115 |
+
|
| 116 |
+
<div id="button-container">
|
| 117 |
+
<!-- Custom file upload button -->
|
| 118 |
+
<label for="fileInput" class="custom-file-label">Choose File</label>
|
| 119 |
+
<input type="file" id="fileInput" accept=".conllu" hidden>
|
| 120 |
+
|
| 121 |
+
<!-- Submit button -->
|
| 122 |
+
<button id="submitBtn" onclick="uploadFile()">Upload and Process</button>
|
| 123 |
+
</div>
|
| 124 |
+
|
| 125 |
+
<div id="message"></div>
|
| 126 |
+
</div>
|
| 127 |
+
|
| 128 |
+
<script>
|
| 129 |
+
const fileInput = document.getElementById('fileInput');
const customFileLabel = document.querySelector('.custom-file-label');
// Remember the caption the label starts with so it can be restored later.
const defaultFileLabelText = customFileLabel.textContent;

// Keep the custom button caption in sync with the current selection.
// If the selection becomes empty (some browsers clear it and fire `change`
// when the picker is cancelled), restore the original caption instead of
// leaving a stale file name on screen.
fileInput.addEventListener('change', () => {
    customFileLabel.textContent = fileInput.files.length > 0
        ? fileInput.files[0].name
        : defaultFileLabelText;
});
|
| 138 |
+
</script>
|
| 139 |
+
</body>
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
<script>
|
| 144 |
+
/**
 * Upload the selected .conllu file and download the processed result.
 *
 * Reads the file from `#fileInput`, POSTs it as multipart form data to `/`,
 * reports progress, warnings and errors in `#message`, and on success
 * triggers a browser download named `<original base name>_parsed.conllu`.
 * `#submitBtn` is disabled while the request is in flight.
 *
 * Warnings arrive URL-encoded and newline-separated in the `X-Warnings`
 * response header. All server-provided text is written with `textContent`
 * (never `innerHTML`) so that it cannot inject markup or scripts.
 *
 * @returns {Promise<void>} Resolves when the request has been handled;
 *     failures are shown in `#message` rather than thrown.
 */
async function uploadFile() {
    const CONLLU_SUFFIX = '.conllu';

    const fileInput = document.getElementById('fileInput');
    const messageDiv = document.getElementById('message');
    const submitBtn = document.getElementById('submitBtn');

    if (!fileInput.files.length) {
        messageDiv.textContent = 'Please select a .conllu file.';
        return;
    }

    const file = fileInput.files[0];
    if (!file.name.endsWith(CONLLU_SUFFIX)) {
        messageDiv.textContent = 'Only .conllu files are allowed.';
        return;
    }

    submitBtn.disabled = true;
    messageDiv.textContent = 'File is being processed...';

    // Prepare the file for upload
    const formData = new FormData();
    formData.append('file', file);

    try {
        const response = await fetch('/', {
            method: 'POST',
            body: formData,
        });

        if (!response.ok) {
            // The error body is shown as plain text: it comes from the
            // server and must not be interpreted as HTML.
            const responseText = await response.text();
            messageDiv.textContent = `Error: ${responseText}`;
            return;
        }

        const warningsHeader = response.headers.get('X-Warnings');
        if (warningsHeader) {
            let warningsText;
            try {
                warningsText = decodeURIComponent(warningsHeader);
            } catch (decodeError) {
                // A malformed percent-encoding must not abort the download
                // of a successfully processed file; show the raw header.
                warningsText = warningsHeader;
            }

            // One <div> per line, built via DOM nodes so the warning text
            // is never parsed as markup.
            messageDiv.textContent = '';
            for (const line of ['Warnings:', ...warningsText.split('\n')]) {
                const row = document.createElement('div');
                row.textContent = line;
                messageDiv.appendChild(row);
            }
        } else {
            messageDiv.textContent = 'File processed successfully.';
        }

        // Handle the file download
        const blob = await response.blob();
        const downloadUrl = URL.createObjectURL(blob);
        // The name is known to end with the suffix, so strip exactly that
        // suffix instead of the first '.conllu' found anywhere in the name.
        const baseName = file.name.slice(0, -CONLLU_SUFFIX.length);

        const a = document.createElement('a');
        a.href = downloadUrl;
        a.download = `${baseName}_parsed.conllu`;
        document.body.appendChild(a);
        a.click();
        a.remove();

        // Release the blob once the click has been dispatched; deferred so
        // the browser has started the download before the URL goes away.
        setTimeout(() => URL.revokeObjectURL(downloadUrl), 0);
    } catch (error) {
        messageDiv.textContent = `Error: ${error.message}`;
    } finally {
        submitBtn.disabled = false;
    }
}
|
| 212 |
+
</script>
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
</body>
|
| 216 |
+
</html>
|