File size: 5,794 Bytes
20a147b
 
af98792
9433a59
af98792
 
 
 
20a147b
2415814
ad6113a
07da78f
 
 
 
 
 
 
 
 
 
af98792
20a147b
07da78f
3dcb04f
 
 
 
 
 
 
 
07da78f
 
 
 
 
 
 
 
 
 
 
af98792
 
2415814
af98792
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2415814
af98792
c0f65ca
8331209
c0f65ca
8331209
c0f65ca
8331209
 
af98792
 
4741434
8331209
 
 
af98792
2415814
70f6e49
ad6113a
 
 
 
2415814
af98792
 
 
 
 
 
2aa6081
1359c1b
 
 
97ab6fa
c0f65ca
2415814
af98792
 
 
ad6113a
af98792
2415814
2aa6081
af98792
97ab6fa
af98792
 
2415814
af98792
111da6d
b9fd23f
2415814
 
 
 
 
 
af98792
2415814
913d3b3
2415814
 
 
 
 
70f6e49
 
 
2415814
70f6e49
 
af98792
 
 
 
 
1359c1b
af98792
 
2415814
111da6d
2415814
 
 
111da6d
 
2415814
 
 
 
 
af98792
2415814
 
 
 
 
70f6e49
97ab6fa
 
2415814
8331209
97ab6fa
70f6e49
af98792
111da6d
2415814
 
 
 
 
af98792
 
1359c1b
af98792
2aa6081
4741434
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import gradio as gr
import pandas as pd
import os

try:
    import docx
except ImportError:
    docx = None

# --- Default codes and metadata ---
# Initial list of thematic codes: it seeds the code-list state and the
# "Select code" dropdown choices. Labels are in French; the trailing comment
# on each entry gives an English gloss.
DEFAULT_CODES = [
    "Travail",          # work, employment
    "Famille",          # family
    "Formation",        # education, training
    "Association",      # community / associations
    "Santé",            # health
    "Politique",        # politics
    "Loisir",           # leisure
    "Religion",         # religion / spiritual
    "Émigration",       # migration
    "Autre",            # other / miscellaneous
]


# Interview-level metadata fields. Each key becomes a column of the coded
# segments table; each value is the (French) label of the matching textbox
# in the UI. Insertion order matters: textboxes are created in this order and
# their values are zipped back onto these keys when a segment is coded.
METADATA_FIELDS = {
    "interview_id": "ID de l'entretien",
    "interview_date": "Date de l'entretien",
    "occupation": "Profession",
    "age": "Âge",
}


# Maps each default code label to a color name string (one entry per item of
# the default code list).
# NOTE(review): not referenced anywhere in the visible code; presumably meant
# for highlighting coded segments — confirm before relying on it.
COLOR_MAP = {
    "Travail": "lightblue",
    "Famille": "lightgreen",
    "Formation": "khaki",
    "Association": "orange",
    "Santé": "lightpink",
    "Politique": "violet",
    "Loisir": "lightcoral",
    "Religion": "lightyellow",
    "Émigration": "lightcyan",
    "Autre": "gray",
}

# --- File processing ---
def read_docx(path):
    """Return the text of a .docx file with one paragraph per line.

    When the optional ``docx`` module could not be imported (the module-level
    name is falsy), an error message string is returned instead of raising.
    """
    if docx:
        document = docx.Document(path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    return "Error: python-docx not installed."

def read_vtt(path):
    """Extract the caption text of a WebVTT file as one space-joined string.

    Lines are stripped first, then the following are discarded:
    blank or whitespace-only lines, the ``WEBVTT`` header line, cue timing
    lines (anything containing ``-->``) and purely numeric lines (cue
    counters). Note that a caption consisting only of digits is therefore
    dropped as well.

    Args:
        path: Path of the ``.vtt`` file, read as UTF-8 (a leading BOM is
            tolerated).

    Returns:
        The remaining lines joined with single spaces.
    """
    # utf-8-sig drops a leading byte-order mark, so the header check below
    # still sees "WEBVTT" at the very start of the first line.
    with open(path, "r", encoding="utf-8-sig") as f:
        stripped_lines = [line.strip() for line in f]

    kept = [
        line
        for line in stripped_lines
        # Filter on the stripped text so whitespace-only lines do not leak
        # through as empty strings (which produced doubled spaces).
        if line
        # Match the header by prefix only: spoken text that merely mentions
        # "WEBVTT" must be kept.
        and not line.startswith("WEBVTT")
        and "-->" not in line
        and not line.isdigit()
    ]
    return " ".join(kept)

def get_empty_df():
    """Build an empty coded-segments table.

    The columns are the three fixed coding columns followed by one column per
    metadata field key, in the order the metadata mapping defines them.
    """
    fixed_columns = ["File ID", "Coded Segment", "Code"]
    all_columns = [*fixed_columns, *METADATA_FIELDS]
    return pd.DataFrame(columns=all_columns)

def process_file(file_obj):
    """Load an uploaded transcript and return it with a fresh coding table.

    Args:
        file_obj: The uploaded file. May be an object exposing its path as
            ``.name`` (as the original code expected), a plain path string,
            or an ``os.PathLike``. ``None`` means no file is selected.

    Returns:
        A ``(text, file_name, empty_df)`` tuple: the transcript text, the
        base name of the file (used as the file identifier) and an empty
        coded-segments table. With ``file_obj=None`` both strings are empty.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If a plain-text file is not valid UTF-8.
    """
    if file_obj is None:
        return "", "", get_empty_df()

    if isinstance(file_obj, os.PathLike):
        # pathlib.Path also has a ``.name`` attribute, but it holds only the
        # base name, so path-like objects must go through os.fspath().
        path = os.fspath(file_obj)
    else:
        # Upload objects carry the path in ``.name``; a bare string has no
        # such attribute and is taken to be the path itself.
        path = getattr(file_obj, "name", file_obj)

    name = os.path.basename(path)
    lowered = name.lower()
    if lowered.endswith(".docx"):
        text = read_docx(path)
    elif lowered.endswith(".vtt"):
        text = read_vtt(path)
    else:
        # Anything else is treated as plain UTF-8 text.
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
    return text, name, get_empty_df()

# --- Apply coding ---
def apply_code(df, segment, code, file_id, *metadata_values):
    """Append one coded segment to the table.

    Args:
        df: Current coded-segments DataFrame.
        segment: Text of the segment to code.
        code: Code label selected for the segment.
        file_id: Identifier of the loaded transcript; empty when no file
            has been uploaded.
        *metadata_values: Metadata values, positionally matched to the keys
            of ``METADATA_FIELDS``.

    Returns:
        A ``(df, status_message, segment_box_update)`` tuple. On success
        ``df`` is a new DataFrame with the row appended and the update clears
        the segment box. On a validation failure ``df`` is returned unchanged
        and the update is a no-op, so the user's pasted text is preserved.
    """
    # gr.update() with no arguments changes nothing on the target component.
    # Clearing the box here would throw away the segment the user pasted when
    # they only forgot, for example, to pick a code.
    keep_segment = gr.update()

    if not file_id:
        return df, "⚠️ Upload a file first", keep_segment
    # A segment made only of whitespace carries nothing to code.
    if not segment or not segment.strip():
        return df, "⚠️ Paste a segment first", keep_segment
    if not code:
        return df, "⚠️ Select a code", keep_segment

    # zip() pairs values with field keys in declaration order and silently
    # ignores any surplus on either side.
    meta_dict = dict(zip(METADATA_FIELDS, metadata_values))
    new_row = {"File ID": file_id, "Coded Segment": segment, "Code": code, **meta_dict}
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

    # Clear segment box after applying
    return df, f"✅ Segment coded as '{code}'", gr.update(value="")

# --- Add new code ---
def add_new_code(new_code, code_list):
    """Return the code list extended with ``new_code`` when it is new.

    The label is stripped of surrounding whitespace. Empty, whitespace-only
    or already-present labels leave the list as it is.

    Args:
        new_code: Label typed by the user; may be ``None`` or empty.
        code_list: Current list of code labels. It is never mutated.

    Returns:
        ``code_list`` itself when nothing is added, otherwise a new list
        with the cleaned label appended.
    """
    cleaned = (new_code or "").strip()
    if not cleaned or cleaned in code_list:
        return code_list
    # Build a new list instead of appending in place so the caller's list
    # (possibly the module-level defaults) is not modified behind its back.
    return [*code_list, cleaned]

# --- Export to Excel ---
def export_excel(df):
    """Write the coded segments to an Excel workbook.

    Returns a ``(path, status)`` pair: the path of the written workbook and
    a success message, or ``(None, "Nothing to export")`` when the table is
    empty. The workbook is always written to ``coded_segments.xlsx`` relative
    to the current working directory, without the index column.
    """
    if not df.empty:
        output_path = "coded_segments.xlsx"
        df.to_excel(output_path, index=False)
        return output_path, "Excel ready"
    return None, "Nothing to export"

# ----------------------------
# GRADIO APP
# ----------------------------
# Build the whole UI: a metadata row on top, then the transcript on the left
# and the coding tools on the right, followed by the event wiring.
with gr.Blocks() as demo:

    # --- States ---
    # Values carried between callbacks rather than shown directly.
    # NOTE(review): full_text is never used as an input or output below.
    full_text = gr.State("")
    file_id = gr.State("")  # base name of the loaded transcript, "" if none
    coded_df_state = gr.State(get_empty_df())  # table of coded segments
    code_categories_state = gr.State(DEFAULT_CODES)  # current list of codes

    # --- Metadata on top ---
    with gr.Row():
        # One textbox per metadata field, created in METADATA_FIELDS order;
        # apply_code zips the values back onto the keys in that same order.
        metadata_inputs = []
        # Only the label is needed here; the key `k` is unused.
        for k, lbl in METADATA_FIELDS.items():
            metadata_inputs.append(gr.Textbox(label=lbl))

    # --- Main interface ---
    with gr.Row():
        # Left: transcript
        with gr.Column(scale=3):
            transcript_box = gr.Textbox(
                label="Transcript (copy the text you want to code)",
                lines=25,
                interactive=True,
                placeholder="Upload a file to see transcript..."
            )

        # Right: coding tools
        with gr.Column(scale=2):
            gr.Markdown("## 🏷️ Code Segment")
            segment_box = gr.Textbox(
                label="Segment to code (paste here)",
                lines=4,
            )
            code_dropdown = gr.Dropdown(label="Select code", choices=DEFAULT_CODES)
            code_input = gr.Textbox(label="Or type new code")
            add_code_btn = gr.Button("Add new code")
            apply_btn = gr.Button("Apply code")

            gr.Markdown("## 📊 Coded Segments")
            table = gr.Dataframe(interactive=False)

            export_btn = gr.Button("Export XLSX")
            # Hidden until an export has produced a file (see the .then below).
            export_file = gr.File(visible=False)

            file_input = gr.File(label="Upload transcript", file_types=[".docx", ".vtt", ".txt"])
            status = gr.Textbox(label="Status", value="Ready")

    # --- Callbacks ---
    # Loading (or clearing) a file fills the transcript box, records the file
    # name and replaces the coded table with an empty one, so segments coded
    # for a previous file are discarded.
    file_input.change(
        fn=process_file,
        inputs=file_input,
        outputs=[transcript_box, file_id, coded_df_state]
    )

    # Add the typed label to the code list held in state.
    add_code_btn.click(
        add_new_code,
        inputs=[code_input, code_categories_state],
        outputs=[code_categories_state]
    )

    # Whenever the code list changes, refresh the dropdown choices from it.
    code_categories_state.change(
        lambda codes: gr.update(choices=codes),
        inputs=code_categories_state,
        outputs=code_dropdown
    )

    # The metadata textboxes are appended after the four fixed inputs and
    # arrive in apply_code as *metadata_values.
    apply_btn.click(
        apply_code,
        inputs=[coded_df_state, segment_box, code_dropdown, file_id] + metadata_inputs,
        outputs=[coded_df_state, status, segment_box]
    )

    # Mirror the coded-segments state into the visible table.
    coded_df_state.change(lambda df: df, inputs=coded_df_state, outputs=table)

    # Export, then reveal the download component only when a file path was
    # actually produced (export_excel yields None for an empty table).
    export_btn.click(
        export_excel,
        inputs=coded_df_state,
        outputs=[export_file, status]
    ).then(
        lambda f: gr.update(visible=f is not None),
        inputs=export_file,
        outputs=export_file
    )

# NOTE(review): this runs unconditionally (no `if __name__ == "__main__"`
# guard), so importing this module also starts the app.
demo.launch()