transcrib_coder / app.py
clementBE's picture
Update app.py
3dcb04f verified
import gradio as gr
import pandas as pd
import os
try:
import docx
except ImportError:
docx = None
# --- Default codes and metadata ---
# Thematic codes available out of the box. The labels are French because they
# are the values offered in the code dropdown; the trailing comments give the
# English gloss for each label.
DEFAULT_CODES = [
    "Travail",      # work, employment
    "Famille",      # family
    "Formation",    # education, training
    "Association",  # community / associations
    "Santé",        # health
    "Politique",    # politics
    "Loisir",       # leisure
    "Religion",     # religion / spiritual
    "Émigration",   # migration
    "Autre",        # other / miscellaneous
]
# Interview-level metadata: key -> French label.
# The keys become extra columns of the coded-segments table and the labels are
# used as the captions of the metadata text boxes. Insertion order matters:
# values are matched to keys positionally when a segment is coded.
METADATA_FIELDS = dict(
    interview_id="ID de l'entretien",
    interview_date="Date de l'entretien",
    occupation="Profession",
    age="Âge",
)
# Colour name associated with each default code (one entry per label in the
# default code list, in the same order).
COLOR_MAP = {
    "Travail":     "lightblue",
    "Famille":     "lightgreen",
    "Formation":   "khaki",
    "Association": "orange",
    "Santé":       "lightpink",
    "Politique":   "violet",
    "Loisir":      "lightcoral",
    "Religion":    "lightyellow",
    "Émigration":  "lightcyan",
    "Autre":       "gray",
}
# --- File processing ---
def read_docx(path):
    """Return the text of a .docx file, one line per paragraph.

    When the optional ``docx`` module could not be imported (the module-level
    name is then falsy), an error message string is returned instead of
    raising, so the caller ends up displaying that message as the transcript.
    """
    if docx:
        document = docx.Document(path)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        return "\n".join(paragraph_texts)
    return "Error: python-docx not installed."
def read_vtt(path):
    """Flatten a WebVTT subtitle file into a single line of text.

    Dropped lines: blank or whitespace-only lines, any line containing
    ``WEBVTT`` (the file header), any line containing ``-->`` (cue timings)
    and lines made only of digits (numeric cue identifiers). The remaining
    lines are stripped and joined with single spaces.

    Args:
        path: Path of a UTF-8 encoded ``.vtt`` file.

    Returns:
        The remaining cue text as one space-separated string (``""`` when
        nothing is left).
    """
    spoken = []
    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            # Strip *before* testing for emptiness: a whitespace-only line
            # would otherwise survive as "" and leave a doubled space in the
            # joined transcript.
            line = raw.strip()
            if not line or "WEBVTT" in line or "-->" in line or line.isdigit():
                continue
            spoken.append(line)
    return " ".join(spoken)
def get_empty_df():
    """Build an empty coded-segments table.

    The columns are the three fixed ones ("File ID", "Coded Segment", "Code")
    followed by one column per key of ``METADATA_FIELDS``, in that order.
    """
    fixed_columns = ["File ID", "Coded Segment", "Code"]
    metadata_columns = [*METADATA_FIELDS]
    return pd.DataFrame(columns=fixed_columns + metadata_columns)
def process_file(file_obj):
    """Load an uploaded transcript and reset the coded-segments table.

    Args:
        file_obj: Upload object exposing the file path as ``.name``, or
            ``None`` when no file is selected.

    Returns:
        A ``(text, file_name, empty_table)`` tuple. ``text`` and ``file_name``
        are both ``""`` when ``file_obj`` is ``None``. A fresh empty table is
        returned in every case.
    """
    if file_obj is None:
        return "", "", get_empty_df()

    upload_path = file_obj.name
    display_name = os.path.basename(upload_path)
    lowered = display_name.lower()

    # Extension -> dedicated reader; anything else is read as plain UTF-8.
    readers = ((".docx", read_docx), (".vtt", read_vtt))
    for suffix, reader in readers:
        if lowered.endswith(suffix):
            content = reader(upload_path)
            break
    else:
        with open(upload_path, "r", encoding="utf-8") as handle:
            content = handle.read()

    return content, display_name, get_empty_df()
# --- Apply coding ---
def apply_code(df, segment, code, file_id, *metadata_values):
    """Append one coded segment to the coded-segments table.

    Args:
        df: Current table of coded segments.
        segment: Text pasted by the user as the segment to code.
        code: Code selected for the segment.
        file_id: Name of the loaded transcript; falsy when none is loaded.
        *metadata_values: Values paired positionally with the keys of
            ``METADATA_FIELDS``. ``zip`` truncates to the shorter side, so
            surplus values are ignored and missing ones are simply absent
            from the new row.

    Returns:
        A ``(table, status_message, segment_box_update)`` tuple. On a
        validation failure the table is returned unchanged and the segment
        box is left as it is; on success the new row is appended and the
        segment box is cleared.
    """
    problem = None
    if not file_id:
        problem = "⚠️ Upload a file first"
    elif not segment or not segment.strip():
        # A whitespace-only paste is not a usable segment either.
        problem = "⚠️ Paste a segment first"
    elif not code:
        problem = "⚠️ Select a code"

    if problem is not None:
        # An argument-less update changes nothing, so the user keeps the text
        # they pasted (e.g. when they only forgot to choose a code).
        return df, problem, gr.update()

    meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
    new_row = {"File ID": file_id, "Coded Segment": segment, "Code": code, **meta_dict}
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

    # Clear the segment box only once the segment has actually been recorded.
    return df, f"✅ Segment coded as '{code}'", gr.update(value="")
# --- Add new code ---
def add_new_code(new_code, code_list):
    """Return the list of codes with ``new_code`` added when it is new.

    The label is stripped of surrounding whitespace first, so ``" Sport "``
    and ``"Sport"`` are the same code and a blank entry adds nothing.

    Args:
        new_code: Label typed by the user; may be ``None`` or blank.
        code_list: Current codes. It is never modified in place.

    Returns:
        ``code_list`` itself when nothing is added, otherwise a new list with
        the stripped label appended.
    """
    label = (new_code or "").strip()
    if not label or label in code_list:
        return code_list
    # Build a new list instead of appending, so the caller's list (possibly a
    # shared default) is left untouched.
    return [*code_list, label]
# --- Export to Excel ---
def export_excel(df):
    """Write the coded-segments table to an Excel workbook.

    Returns:
        ``(path, status_message)``. For an empty table nothing is written and
        ``(None, "Nothing to export")`` is returned; otherwise the table is
        saved without its index to ``coded_segments.xlsx`` (relative to the
        current working directory) and that path is returned.
    """
    has_rows = not df.empty
    if has_rows:
        output_path = "coded_segments.xlsx"
        df.to_excel(output_path, index=False)
        return output_path, "Excel ready"
    return None, "Nothing to export"
# ----------------------------
# GRADIO APP
# ----------------------------
with gr.Blocks() as demo:
    # NOTE(review): this file arrived with its indentation stripped. The
    # nesting below (which components sit inside which Row/Column) is a
    # best-effort reconstruction — confirm against the intended layout.

    # --- States ---
    full_text = gr.State("")  # not referenced by any callback in this block
    file_id = gr.State("")  # receives the uploaded file's name from process_file
    coded_df_state = gr.State(get_empty_df())  # table of coded segments
    code_categories_state = gr.State(DEFAULT_CODES)  # codes offered in the dropdown

    # --- Metadata on top ---
    # One text box per METADATA_FIELDS entry, created in dict order; apply_code
    # pairs the values with the keys positionally, so this order must match.
    with gr.Row():
        metadata_inputs = []
        for k, lbl in METADATA_FIELDS.items():
            metadata_inputs.append(gr.Textbox(label=lbl))

    # --- Main interface ---
    with gr.Row():
        # Left: transcript
        with gr.Column(scale=3):
            transcript_box = gr.Textbox(
                label="Transcript (copy the text you want to code)",
                lines=25,
                interactive=True,
                placeholder="Upload a file to see transcript..."
            )
        # Right: coding tools
        with gr.Column(scale=2):
            gr.Markdown("## 🏷️ Code Segment")
            segment_box = gr.Textbox(
                label="Segment to code (paste here)",
                lines=4,
            )
            code_dropdown = gr.Dropdown(label="Select code", choices=DEFAULT_CODES)
            code_input = gr.Textbox(label="Or type new code")
            add_code_btn = gr.Button("Add new code")
            apply_btn = gr.Button("Apply code")
            gr.Markdown("## 📊 Coded Segments")
            table = gr.Dataframe(interactive=False)
            export_btn = gr.Button("Export XLSX")
            # Hidden until an export has produced a file (see the .then() below).
            export_file = gr.File(visible=False)
    file_input = gr.File(label="Upload transcript", file_types=[".docx", ".vtt", ".txt"])
    status = gr.Textbox(label="Status", value="Ready")

    # --- Callbacks ---
    # Selecting (or clearing) a file fills the transcript box and the file
    # name, and replaces the coded table with a fresh empty one — rows coded
    # so far are therefore dropped.
    file_input.change(
        fn=process_file,
        inputs=file_input,
        outputs=[transcript_box, file_id, coded_df_state]
    )
    # "Add new code" only updates the code-list state ...
    add_code_btn.click(
        add_new_code,
        inputs=[code_input, code_categories_state],
        outputs=[code_categories_state]
    )
    # ... and the dropdown choices follow the state whenever it changes.
    code_categories_state.change(
        lambda codes: gr.update(choices=codes),
        inputs=code_categories_state,
        outputs=code_dropdown
    )
    # Input order must match apply_code's signature: the four fixed arguments
    # first, then the metadata boxes collected as *metadata_values.
    apply_btn.click(
        apply_code,
        inputs=[coded_df_state, segment_box, code_dropdown, file_id] + metadata_inputs,
        outputs=[coded_df_state, status, segment_box]
    )
    # Mirror the table state into the read-only Dataframe component.
    coded_df_state.change(lambda df: df, inputs=coded_df_state, outputs=table)
    # Export, then show the file component only if export_excel returned a
    # path (it returns None when there is nothing to export).
    export_btn.click(
        export_excel,
        inputs=coded_df_state,
        outputs=[export_file, status]
    ).then(
        lambda f: gr.update(visible=f is not None),
        inputs=export_file,
        outputs=export_file
    )
# Runs at import time: there is no `if __name__ == "__main__"` guard.
demo.launch()