Spaces:

clementBE
/

transcrib_coder

Sleeping

App Files Files Community

transcrib_coder / app.py

clementBE

Update app.py

3dcb04f verified 2 months ago

raw

history blame contribute delete

5.79 kB

	import gradio as gr
	import pandas as pd
	import os

	try:
	import docx
	except ImportError:
	docx = None

	# --- Default codes and metadata ---
	# Thematic codes offered in the "Select code" dropdown at start-up.
	# The labels are French and are shown to the user exactly as written here;
	# the trailing comments give an English gloss.
	DEFAULT_CODES = [
	    "Travail",      # work / employment
	    "Famille",      # family
	    "Formation",    # education / training
	    "Association",  # community / associations
	    "Santé",        # health
	    "Politique",    # politics
	    "Loisir",       # leisure
	    "Religion",     # religion / spirituality
	    "Émigration",   # migration
	    "Autre",        # other / miscellaneous
	]


	# Interview metadata recorded with every coded segment.
	# Key   -> column name appended to the coded-segments table.
	# Value -> French label shown on the matching input box.
	# Insertion order matters: values are paired with these keys positionally.
	METADATA_FIELDS = {
	    "interview_id": "ID de l'entretien",
	    "interview_date": "Date de l'entretien",
	    "occupation": "Profession",
	    "age": "Âge",
	}


	# CSS colour name associated with each default code.
	# NOTE(review): nothing in the visible part of this file reads COLOR_MAP;
	# confirm whether it is still needed (e.g. for highlighting).
	COLOR_MAP = {
	    "Travail": "lightblue",
	    "Famille": "lightgreen",
	    "Formation": "khaki",
	    "Association": "orange",
	    "Santé": "lightpink",
	    "Politique": "violet",
	    "Loisir": "lightcoral",
	    "Religion": "lightyellow",
	    "Émigration": "lightcyan",
	    "Autre": "gray",
	}

	# --- File processing ---
	def read_docx(path):
	    """Return the text of the .docx file at ``path``, one paragraph per line.

	    When the optional ``docx`` module is unavailable (the module-level name is
	    falsy), an error message string is returned instead of raising.
	    """
	    if docx:
	        document = docx.Document(path)
	        return "\n".join(paragraph.text for paragraph in document.paragraphs)
	    return "Error: python-docx not installed."

	def read_vtt(path):
	    """Flatten a WebVTT subtitle file into a single line of text.

	    The file is read as UTF-8. Each line is stripped, then dropped when it is
	    empty or whitespace-only, contains "WEBVTT", contains "-->" (cue timing),
	    or consists only of digits (numeric cue identifier). The surviving lines
	    are joined with single spaces.

	    Args:
	        path: Path of the .vtt file to read.

	    Returns:
	        The remaining text joined into one string ("" if nothing remains).
	    """
	    with open(path, "r", encoding="utf-8") as f:
	        # Strip before filtering so whitespace-only lines count as empty
	        # and cannot leave doubled spaces in the joined text.
	        stripped_lines = [raw.strip() for raw in f.read().split("\n")]

	    return " ".join(
	        line
	        for line in stripped_lines
	        if line
	        and "WEBVTT" not in line
	        and "-->" not in line
	        and not line.isdigit()
	    )

	def get_empty_df():
	    """Build an empty coded-segments table.

	    The columns are "File ID", "Coded Segment" and "Code", followed by one
	    column per key of METADATA_FIELDS, in that order.
	    """
	    column_names = ["File ID", "Coded Segment", "Code", *METADATA_FIELDS]
	    return pd.DataFrame(columns=column_names)

	def process_file(file_obj):
	    """Load an uploaded transcript and start a fresh coding table.

	    Args:
	        file_obj: The uploaded file, or None when there is no upload. May be
	            a plain path (``str`` / ``os.PathLike``) or any object whose
	            ``name`` attribute holds the path on disk.

	    Returns:
	        A ``(text, name, table)`` tuple: the transcript text, the base name
	        of the file, and an empty coded-segments table. With no upload, both
	        strings are empty.
	    """
	    if file_obj is None:
	        return "", "", get_empty_df()

	    # Use a plain path directly; any other object is expected to expose
	    # its location on disk as ``.name``.
	    if isinstance(file_obj, (str, os.PathLike)):
	        path = os.fspath(file_obj)
	    else:
	        path = file_obj.name

	    name = os.path.basename(path)
	    lowered = name.lower()  # extension matching is case-insensitive

	    if lowered.endswith(".docx"):
	        text = read_docx(path)
	    elif lowered.endswith(".vtt"):
	        text = read_vtt(path)
	    else:
	        # Anything else is read as plain UTF-8 text.
	        with open(path, "r", encoding="utf-8") as f:
	            text = f.read()

	    return text, name, get_empty_df()

	# --- Apply coding ---
	def apply_code(df, segment, code, file_id, *metadata_values):
	    """Append one coded segment to the table.

	    Args:
	        df: Current coded-segments table.
	        segment: Text of the segment to code.
	        code: Code label chosen for the segment.
	        file_id: Identifier of the loaded transcript; falsy when none is loaded.
	        *metadata_values: Values paired positionally with the keys of
	            METADATA_FIELDS (surplus values are ignored by ``zip``).

	    Returns:
	        A ``(table, status, segment_update)`` tuple. On a validation failure
	        the table is returned unchanged and the segment box keeps its current
	        text. On success the table has one more row and the segment box is
	        cleared.
	    """
	    # Validation failures must not wipe what the user pasted: echo the
	    # current segment back instead of clearing the box.
	    if not file_id:
	        return df, "⚠️ Upload a file first", gr.update(value=segment or "")
	    if not segment:
	        return df, "⚠️ Paste a segment first", gr.update(value="")
	    if not code:
	        return df, "⚠️ Select a code", gr.update(value=segment)

	    meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
	    new_row = {
	        "File ID": file_id,
	        "Coded Segment": segment,
	        "Code": code,
	        **meta_dict,
	    }
	    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

	    # Clear the segment box only after the row has been recorded.
	    return df, f"✅ Segment coded as '{code}'", gr.update(value="")

	# --- Add new code ---
	def add_new_code(new_code, code_list):
	    """Return the code list extended with ``new_code`` when it is new.

	    Surrounding whitespace is removed from ``new_code`` first. Blank input,
	    ``None``, or a label already in ``code_list`` leaves the list as it is.

	    Args:
	        new_code: Label typed by the user (may be empty or None).
	        code_list: Current list of code labels.

	    Returns:
	        ``code_list`` itself when nothing is added, otherwise a new list with
	        the label appended. The input list is never modified in place.
	    """
	    label = (new_code or "").strip()
	    if not label or label in code_list:
	        return code_list
	    # Build a new list rather than appending to the caller's object.
	    return [*code_list, label]

	# --- Export to Excel ---
	def export_excel(df):
	    """Write the coded-segments table to an .xlsx file.

	    Args:
	        df: Table of coded segments.

	    Returns:
	        ``(None, "Nothing to export")`` when the table is empty, otherwise
	        ``(path, "Excel ready")`` where ``path`` is the written file.
	    """
	    if df.empty:
	        return None, "Nothing to export"

	    import tempfile  # local import: only needed when a file is actually written

	    # A fixed relative path would be shared by every caller, so two exports
	    # could overwrite each other. Each export gets its own directory while
	    # the file keeps the same name.
	    export_dir = tempfile.mkdtemp(prefix="coded_segments_")
	    path = os.path.join(export_dir, "coded_segments.xlsx")
	    df.to_excel(path, index=False)
	    return path, "Excel ready"

	# ----------------------------
	# GRADIO APP
	# ----------------------------
	with gr.Blocks() as demo:
	    # NOTE(review): the container nesting here is reconstructed from the
	    # section comments; confirm which widgets belong inside the right-hand
	    # column versus below the row.

	    # --- State holders ---
	    full_text = gr.State("")  # not wired to any callback below
	    file_id = gr.State("")
	    coded_df_state = gr.State(get_empty_df())
	    code_categories_state = gr.State(DEFAULT_CODES)

	    # --- Metadata row: one textbox per metadata field, in declaration order ---
	    with gr.Row():
	        metadata_inputs = [
	            gr.Textbox(label=field_label)
	            for field_label in METADATA_FIELDS.values()
	        ]

	    # --- Main interface ---
	    with gr.Row():
	        # Left column: the transcript text
	        with gr.Column(scale=3):
	            transcript_box = gr.Textbox(
	                label="Transcript (copy the text you want to code)",
	                lines=25,
	                interactive=True,
	                placeholder="Upload a file to see transcript...",
	            )

	        # Right column: coding tools, results table and export
	        with gr.Column(scale=2):
	            gr.Markdown("## 🏷️ Code Segment")
	            segment_box = gr.Textbox(label="Segment to code (paste here)", lines=4)
	            code_dropdown = gr.Dropdown(label="Select code", choices=DEFAULT_CODES)
	            code_input = gr.Textbox(label="Or type new code")
	            add_code_btn = gr.Button("Add new code")
	            apply_btn = gr.Button("Apply code")

	            gr.Markdown("## 📊 Coded Segments")
	            table = gr.Dataframe(interactive=False)

	            export_btn = gr.Button("Export XLSX")
	            export_file = gr.File(visible=False)

	    file_input = gr.File(
	        label="Upload transcript",
	        file_types=[".docx", ".vtt", ".txt"],
	    )
	    status = gr.Textbox(label="Status", value="Ready")

	    # --- Event wiring ---

	    # A new upload fills the transcript, records the file name and resets the table.
	    file_input.change(
	        process_file,
	        inputs=file_input,
	        outputs=[transcript_box, file_id, coded_df_state],
	    )

	    # Adding a code updates the stored list; the dropdown follows the list.
	    add_code_btn.click(
	        fn=add_new_code,
	        inputs=[code_input, code_categories_state],
	        outputs=[code_categories_state],
	    )
	    code_categories_state.change(
	        fn=lambda codes: gr.update(choices=codes),
	        inputs=code_categories_state,
	        outputs=code_dropdown,
	    )

	    # Coding a segment updates the stored table; the visible table mirrors it.
	    apply_btn.click(
	        fn=apply_code,
	        inputs=[coded_df_state, segment_box, code_dropdown, file_id, *metadata_inputs],
	        outputs=[coded_df_state, status, segment_box],
	    )
	    coded_df_state.change(fn=lambda df: df, inputs=coded_df_state, outputs=table)

	    # Export, then reveal the download widget only if a file was produced.
	    export_event = export_btn.click(
	        fn=export_excel,
	        inputs=coded_df_state,
	        outputs=[export_file, status],
	    )
	    export_event.then(
	        fn=lambda f: gr.update(visible=f is not None),
	        inputs=export_file,
	        outputs=export_file,
	    )

	demo.launch()