File size: 6,533 Bytes
f042f29
 
461f0ea
f042f29
 
 
 
 
 
 
 
 
 
 
 
 
 
461f0ea
f042f29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95fc20d
f042f29
 
 
461f0ea
f042f29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461f0ea
f042f29
 
461f0ea
f042f29
461f0ea
f042f29
 
461f0ea
f042f29
 
 
461f0ea
 
f042f29
461f0ea
f042f29
 
 
 
461f0ea
 
f042f29
461f0ea
f042f29
 
461f0ea
f042f29
 
 
 
 
 
 
 
 
 
461f0ea
f042f29
461f0ea
f042f29
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
from __future__ import annotations

import csv
import hashlib
import os
import tempfile
from pathlib import Path

from flask import Flask, after_this_request, render_template, request, send_file
from werkzeug.utils import secure_filename

from pipeline import labels2attrs, run_pipeline

# Flask application object; the "templates" directory is used both for
# templates and as the static folder.
app = Flask(__name__, template_folder="templates", static_folder="templates")

# Directory (relative to the working directory) that holds uploads while they
# are processed. It is created at import time if it does not exist yet.
UPLOAD_DIR = Path("uploads")
UPLOAD_DIR.mkdir(exist_ok=True)

# Lower-case file extensions (without the leading dot) accepted for upload.
ALLOWED_EXTENSIONS = {"txt", "csv"}


def allowed_file(filename: str) -> bool:
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared case-insensitively.
    A name without any dot is rejected.
    """
    _stem, dot, extension = filename.rpartition(".")
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS


def iter_texts_from_file(file_path: Path):
    """Yield the non-empty, whitespace-stripped texts stored in *file_path*.

    The file suffix (case-insensitive) selects how the file is read:

    * ``.txt`` -- every line is one text.
    * ``.csv`` -- the first column of every row is one text.

    Any other suffix yields nothing. Files are decoded as ``utf-8-sig`` and
    bytes that cannot be decoded are ignored.
    """
    extension = file_path.suffix.lower()

    if extension == ".csv":
        # newline="" lets the csv module handle line endings inside quoted fields.
        with file_path.open("r", encoding="utf-8-sig", errors="ignore", newline="") as handle:
            for record in csv.reader(handle):
                if not record:
                    continue
                first_cell = record[0].strip()
                if first_cell:
                    yield first_cell
        return

    if extension != ".txt":
        return

    with file_path.open("r", encoding="utf-8-sig", errors="ignore") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            yield stripped


def process_file(file_path: Path):
    """Run the pipeline on every text read from *file_path*.

    Returns a list, in file order, of ``{"input": text, "output": result}``
    dicts where ``result`` is whatever ``run_pipeline(text)`` returned.
    """
    return [
        {"input": snippet, "output": run_pipeline(snippet)}
        for snippet in iter_texts_from_file(file_path)
    ]


def summarize_result(result_rows):
    """Tally the clause attributes of every processed input.

    For each entry of *result_rows* (keyed by its position in the sequence),
    ``entry["output"][1]`` is iterated as ``(clause_text, label_name)`` pairs.
    Each label is looked up in ``labels2attrs`` and every attribute it maps to
    is counted. An attribute containing ``"NA"`` is counted under an NA bucket
    chosen by its position: 0 -> genericity, 1 -> eventivity, anything else ->
    boundedness.

    Returns:
        A ``(counts, individual_labels, proportions)`` tuple of dicts keyed by
        row index:

        * ``counts[i]`` -- dict of attribute name -> count (NA buckets included).
        * ``individual_labels[i]`` -- list of the label names in order.
        * ``proportions[i]`` -- list of seven floats: generic, specific,
          stative, dynamic, static, episodic, habitual. Each is the count
          divided by the total of its group (the group's NA bucket included),
          or ``0.0`` when that total is zero.
    """
    # Each group is (attribute names, NA bucket); group order matches the
    # attribute position used to pick the NA bucket.
    groups = (
        (("generic", "specific"), "NA genericity"),
        (("stative", "dynamic"), "NA eventivity"),
        (("static", "episodic", "habitual"), "NA boundedness"),
    )
    ordered_keys = [name for members, _ in groups for name in members]
    ordered_keys += [na_key for _, na_key in groups]
    last_group = len(groups) - 1

    counts, individual_labels, proportions = {}, {}, {}

    for row_no, entry in enumerate(result_rows):
        tally = dict.fromkeys(ordered_keys, 0)
        seen_labels = []
        individual_labels[row_no] = seen_labels
        counts[row_no] = tally

        for _clause, label in entry["output"][1]:
            seen_labels.append(label)
            for position, attr in enumerate(labels2attrs[label]):
                if "NA" in attr:
                    # Positions past the last group fall into the boundedness bucket.
                    tally[groups[min(position, last_group)][1]] += 1
                else:
                    tally[attr] += 1

        shares = []
        for members, na_key in groups:
            total = sum(tally[name] for name in members) + tally[na_key]
            for name in members:
                shares.append(tally[name] / total if total else 0.0)
        proportions[row_no] = shares

    return counts, individual_labels, proportions


def write_results_csv(result_rows) -> str:
    """Write clauses, labels, counts and proportions for each input to a CSV.

    Args:
        result_rows: Sequence of ``{"input": ..., "output": ...}`` dicts.
            ``output[0]`` is iterated as ``(clause_text, clause_id)`` pairs;
            the remaining columns come from ``summarize_result(result_rows)``.

    Returns:
        Path of the CSV file that was written. The file is a temporary file
        created with ``delete=False``, so the caller must remove it.

    Raises:
        Any exception raised while summarizing or writing is propagated. If
        writing fails, the partially written temporary file is removed first.
    """
    counts, individual_labels, proportions = summarize_result(result_rows)

    header = [
        "input",
        "clauses",
        "individual labels",
        "genericity: generic count",
        "genericity: specific count",
        "eventivity: stative count",
        "eventivity: dynamic count",
        "boundedness: static count",
        "boundedness: episodic count",
        "habitual count",
        "genericity: proportion generic",
        "genericity: proportion specific",
        "eventivity: proportion stative",
        "eventivity: proportion dynamic",
        "boundedness: proportion static",
        "boundedness: proportion episodic",
        "proportion habitual",
    ]

    # Write through the temporary file's own handle (text mode, newline=""
    # as the csv module requires) instead of closing and reopening the path.
    tmp = tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", newline="", suffix=".csv", delete=False
    )
    try:
        with tmp:
            writer = csv.writer(tmp)
            writer.writerow(header)

            for idx, item in enumerate(result_rows):
                clauses = "\n".join(
                    f"{clause_id}: {clause_text}"
                    for clause_text, clause_id in item["output"][0]
                )
                label_lines = "\n".join(individual_labels[idx])
                row_counts = counts[idx]

                writer.writerow([
                    item["input"],
                    clauses,
                    label_lines,
                    row_counts["generic"],
                    row_counts["specific"],
                    row_counts["stative"],
                    row_counts["dynamic"],
                    row_counts["static"],
                    row_counts["episodic"],
                    row_counts["habitual"],
                    *proportions[idx],
                ])
    except BaseException:
        # The caller never learns the path when we fail, so nobody else could
        # clean the file up. Removal is best-effort: the original error is the
        # one worth reporting.
        try:
            os.remove(tmp.name)
        except OSError:
            pass
        raise

    return tmp.name


@app.route("/", methods=["GET"])
def index():
    """Render the ``index.html`` template for GET requests to the root URL."""
    return render_template("index.html")


@app.route("/", methods=["POST"])
def upload_file():
    """Process an uploaded text/CSV file and return the results as a CSV download.

    The upload (form field ``file``) is stored under ``UPLOAD_DIR`` only while
    it is being processed and is removed afterwards, whether or not processing
    succeeds. The generated CSV is removed once the request has been handled.

    Returns:
        ``("No file selected", 400)`` when no file was sent,
        ``("File type not allowed", 400)`` when the extension is not accepted,
        otherwise the ``send_file`` response offering ``result.csv`` as an
        attachment.
    """
    if "file" not in request.files:
        return "No file selected", 400

    file = request.files["file"]
    if not file or file.filename == "":
        return "No file selected", 400

    if not allowed_file(file.filename):
        return "File type not allowed", 400

    # Take the extension from the raw filename: allowed_file() has just
    # checked it against ALLOWED_EXTENSIONS, so it is safe to use in a path.
    # secure_filename() can drop the whole stem (non-ASCII names, names that
    # start with a dot), which leaves no suffix at all; the file would then be
    # read as an unsupported type and silently yield an empty result.
    extension = file.filename.rsplit(".", 1)[1].lower()

    # A unique name per request keeps identical uploads handled concurrently
    # from sharing one path and deleting each other's file mid-processing.
    fd, upload_name = tempfile.mkstemp(dir=str(UPLOAD_DIR), suffix=f".{extension}")
    os.close(fd)
    saved_path = Path(upload_name)

    try:
        file.save(saved_path)
        result_rows = process_file(saved_path)
        output_csv = write_results_csv(result_rows)
    finally:
        # The upload is not needed once processed; remove it even on failure
        # so failed requests do not accumulate files in UPLOAD_DIR.
        try:
            saved_path.unlink()
        except OSError:
            app.logger.exception("Failed to clean up temp files")

    @after_this_request
    def cleanup(response):
        # Registered before send_file() is called; runs after the view returns.
        try:
            if os.path.exists(output_csv):
                os.remove(output_csv)
        except Exception:
            app.logger.exception("Failed to clean up temp files")
        return response

    return send_file(output_csv, as_attachment=True, download_name="result.csv")


# When executed as a script, serve the app on all network interfaces at
# port 7860 with debug mode disabled.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860, debug=False)