# Source: Hugging Face Space "clementBE/transcrib_coder"
# Space status when captured: Sleeping
# File size: 5,794 bytes

import gradio as gr
import pandas as pd
import os

try:
    import docx
except ImportError:
    docx = None

# --- Default codes and metadata ---
# Thematic codes offered by default (French labels). English glosses, in
# order: work/employment, family, education/training, community/associations,
# health, politics, leisure, religion/spiritual, migration,
# other/miscellaneous.
DEFAULT_CODES = [
    "Travail", "Famille", "Formation", "Association", "Santé",
    "Politique", "Loisir", "Religion", "Émigration", "Autre",
]


# Interview metadata fields: column key -> French label.
METADATA_FIELDS = dict(
    interview_id="ID de l'entretien",
    interview_date="Date de l'entretien",
    occupation="Profession",
    age="Âge",
)


# Colour name paired with each default code, in the same order as
# DEFAULT_CODES.
COLOR_MAP = dict(
    zip(
        DEFAULT_CODES,
        (
            "lightblue", "lightgreen", "khaki", "orange", "lightpink",
            "violet", "lightcoral", "lightyellow", "lightcyan", "gray",
        ),
    )
)

# --- File processing ---
def read_docx(path):
    """Return the paragraph text of a .docx file, one paragraph per line.

    If the optional ``docx`` module is unavailable (the module-level name is
    falsy), an error message string is returned instead of the document text.
    """
    if docx:
        document = docx.Document(path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    return "Error: python-docx not installed."

def read_vtt(path):
    """Flatten a WebVTT file into one space-separated string of caption text.

    Lines are dropped when, after stripping, they are empty, contain
    ``WEBVTT``, contain a ``-->`` timing arrow, or consist only of digits
    (numeric cue identifiers). The remaining lines are stripped and joined
    with single spaces.

    Args:
        path: Path of the UTF-8 encoded .vtt file.

    Returns:
        The caption text as a single string.
    """
    cleaned = []
    with open(path, "r", encoding="utf-8") as f:
        for raw in f.read().split("\n"):
            text = raw.strip()
            # Test emptiness AFTER stripping: whitespace-only lines would
            # otherwise survive as "" and leave double spaces in the output.
            if not text:
                continue
            if "WEBVTT" in text or "-->" in text or text.isdigit():
                continue
            cleaned.append(text)
    return " ".join(cleaned)

def get_empty_df():
    """Build an empty coded-segments table.

    Columns are the three fixed coding columns followed by one column per
    key of ``METADATA_FIELDS``.
    """
    column_names = ["File ID", "Coded Segment", "Code"]
    column_names.extend(METADATA_FIELDS)  # iterating a dict yields its keys
    return pd.DataFrame(columns=column_names)

def process_file(file_obj):
    """Load an uploaded transcript and reset the coding table.

    Args:
        file_obj: Uploaded file object exposing its path as ``.name``, or
            ``None`` when no file is selected.

    Returns:
        A ``(text, file_name, empty_dataframe)`` tuple. ``text`` and
        ``file_name`` are empty strings when ``file_obj`` is ``None``.
    """
    if file_obj is None:
        return "", "", get_empty_df()

    source_path = file_obj.name
    base_name = os.path.basename(source_path)
    lowered = base_name.lower()

    # Extension-specific readers; any other file is read as UTF-8 text.
    for suffix, reader in ((".docx", read_docx), (".vtt", read_vtt)):
        if lowered.endswith(suffix):
            content = reader(source_path)
            break
    else:
        with open(source_path, "r", encoding="utf-8") as handle:
            content = handle.read()

    return content, base_name, get_empty_df()

# --- Apply coding ---
def apply_code(df, segment, code, file_id, *metadata_values):
    """Append one coded segment to the table.

    Args:
        df: Current table of coded segments.
        segment: Text of the segment to code.
        code: Code label to attach to the segment.
        file_id: Identifier of the transcript the segment comes from.
        *metadata_values: Values paired positionally with the keys of
            ``METADATA_FIELDS``.

    Returns:
        A ``(dataframe, status_message, segment_box_update)`` tuple. On a
        validation failure the dataframe is returned unchanged and the
        segment box is left as it is; on success the new row is appended and
        the segment box is cleared.
    """
    # An argument-less update leaves the component untouched, so a failed
    # validation no longer wipes the segment the user has already pasted.
    keep_segment = gr.update()
    if not file_id:
        return df, "⚠️ Upload a file first", keep_segment
    if not segment:
        return df, "⚠️ Paste a segment first", keep_segment
    if not code:
        return df, "⚠️ Select a code", keep_segment

    meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
    new_row = {"File ID": file_id, "Coded Segment": segment, "Code": code, **meta_dict}
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

    # Clear the segment box only after the segment was actually coded.
    return df, f"✅ Segment coded as '{code}'", gr.update(value="")

# --- Add new code ---
def add_new_code(new_code, code_list):
    """Return the code list extended with ``new_code`` when it is new.

    The input list is never mutated: a new list is returned when a code is
    added, so the caller's list (for example a shared default) stays intact
    and the result is a distinct object from the input.

    Args:
        new_code: Label typed by the user; surrounding whitespace is ignored.
            ``None``, empty and whitespace-only values add nothing.
        code_list: Current list of code labels.

    Returns:
        ``code_list`` itself when nothing is added, otherwise a new list with
        the stripped label appended.
    """
    label = (new_code or "").strip()
    if not label or label in code_list:
        return code_list
    return [*code_list, label]

# --- Export to Excel ---
def export_excel(df):
    """Write the coded segments to ``coded_segments.xlsx``.

    Returns:
        ``(path, status_message)``; ``path`` is ``None`` when the table is
        empty and nothing is written.
    """
    if not df.empty:
        out_path = "coded_segments.xlsx"
        df.to_excel(out_path, index=False)
        return out_path, "Excel ready"
    return None, "Nothing to export"

# ----------------------------
# GRADIO APP
# ----------------------------
# Build the UI: metadata textboxes on top, transcript on the left, coding
# tools / table / export / upload on the right, then wire the callbacks.
with gr.Blocks() as demo:

    # --- States ---
    # NOTE(review): full_text is declared but no callback in this block reads
    # or writes it.
    full_text = gr.State("")
    # Base name of the uploaded file (second output of process_file).
    file_id = gr.State("")
    # Table of coded segments accumulated by apply_code.
    coded_df_state = gr.State(get_empty_df())
    # Code labels, seeded with DEFAULT_CODES and extended by add_new_code.
    code_categories_state = gr.State(DEFAULT_CODES)

    # --- Metadata on top ---
    with gr.Row():
        # One textbox per METADATA_FIELDS entry, labelled with the dict value.
        # Their values are passed to apply_code as trailing positional
        # arguments. (The key `k` is not used in the loop body.)
        metadata_inputs = []
        for k, lbl in METADATA_FIELDS.items():
            metadata_inputs.append(gr.Textbox(label=lbl))

    # --- Main interface ---
    with gr.Row():
        # Left: transcript
        with gr.Column(scale=3):
            transcript_box = gr.Textbox(
                label="Transcript (copy the text you want to code)",
                lines=25,
                interactive=True,
                placeholder="Upload a file to see transcript..."
            )

        # Right: coding tools
        with gr.Column(scale=2):
            gr.Markdown("## 🏷️ Code Segment")
            segment_box = gr.Textbox(
                label="Segment to code (paste here)",
                lines=4,
            )
            code_dropdown = gr.Dropdown(label="Select code", choices=DEFAULT_CODES)
            code_input = gr.Textbox(label="Or type new code")
            add_code_btn = gr.Button("Add new code")
            apply_btn = gr.Button("Apply code")

            gr.Markdown("## 📊 Coded Segments")
            # Read-only view of coded_df_state (filled by the change handler
            # registered below).
            table = gr.Dataframe(interactive=False)

            export_btn = gr.Button("Export XLSX")
            # Created hidden; made visible after an export returns a path.
            export_file = gr.File(visible=False)

            file_input = gr.File(label="Upload transcript", file_types=[".docx", ".vtt", ".txt"])
            status = gr.Textbox(label="Status", value="Ready")

    # --- Callbacks ---
    # A change of the uploaded file loads its text into the transcript box,
    # stores its name in file_id and replaces coded_df_state with the empty
    # table returned by process_file.
    file_input.change(
        fn=process_file,
        inputs=file_input,
        outputs=[transcript_box, file_id, coded_df_state]
    )

    # Add the typed label to the code list state (code_input is not cleared).
    add_code_btn.click(
        add_new_code,
        inputs=[code_input, code_categories_state],
        outputs=[code_categories_state]
    )

    # Refresh the dropdown choices whenever the code list state changes.
    code_categories_state.change(
        lambda codes: gr.update(choices=codes),
        inputs=code_categories_state,
        outputs=code_dropdown
    )

    # Code the pasted segment; apply_code returns the new table, a status
    # message and an update for the segment box.
    apply_btn.click(
        apply_code,
        inputs=[coded_df_state, segment_box, code_dropdown, file_id] + metadata_inputs,
        outputs=[coded_df_state, status, segment_box]
    )

    # Mirror the table state into the visible Dataframe component.
    coded_df_state.change(lambda df: df, inputs=coded_df_state, outputs=table)

    # Export, then show the file component only if export_excel produced a
    # path (it returns None when there is nothing to export).
    export_btn.click(
        export_excel,
        inputs=coded_df_state,
        outputs=[export_file, status]
    ).then(
        lambda f: gr.update(visible=f is not None),
        inputs=export_file,
        outputs=export_file
    )

# Launched unconditionally at module level (no __main__ guard).
demo.launch()