Spaces:
Sleeping
Sleeping
import os
import tempfile

import gradio as gr
import pandas as pd

try:
    import docx
except ImportError:
    docx = None
# --- Default codes and metadata ---
# Thematic codes available by default (French label, English gloss).
DEFAULT_CODES = [
    "Travail",      # work, employment
    "Famille",      # family
    "Formation",    # education, training
    "Association",  # community / associations
    "Santé",        # health
    "Politique",    # politics
    "Loisir",       # leisure
    "Religion",     # religion / spiritual
    "Émigration",   # migration
    "Autre",        # other / miscellaneous
]

# Interview-level metadata: column name -> label used for its input field.
METADATA_FIELDS = {
    "interview_id": "ID de l'entretien",
    "interview_date": "Date de l'entretien",
    "occupation": "Profession",
    "age": "Âge",
}

# Colour name per default code.
# NOTE(review): nothing in the visible part of this file reads COLOR_MAP.
COLOR_MAP = {
    "Travail": "lightblue",
    "Famille": "lightgreen",
    "Formation": "khaki",
    "Association": "orange",
    "Santé": "lightpink",
    "Politique": "violet",
    "Loisir": "lightcoral",
    "Religion": "lightyellow",
    "Émigration": "lightcyan",
    "Autre": "gray",
}
# --- File processing ---
def read_docx(path):
    """Return the text of the .docx file at *path*, one line per paragraph.

    When the module-level ``docx`` name is falsy (python-docx failed to
    import), an error message string is returned instead of raising.
    """
    if not docx:
        return "Error: python-docx not installed."
    document = docx.Document(path)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(paragraph_texts)
def read_vtt(path):
    """Return the spoken text of a WebVTT file as one space-joined string.

    Lines are stripped, then dropped when they are empty, contain "WEBVTT",
    contain a "-->" timing arrow, or consist only of digits (numeric cue
    identifiers).

    Args:
        path: Path to a UTF-8 encoded .vtt file.

    Returns:
        The remaining lines joined with single spaces.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw_lines = f.read().split("\n")

    # Filter on the *stripped* text: testing the raw line let whitespace-only
    # lines through, which then became empty strings and doubled the spaces
    # in the joined transcript.
    stripped_lines = (raw.strip() for raw in raw_lines)
    cleaned = [
        line
        for line in stripped_lines
        if line
        and "WEBVTT" not in line
        and "-->" not in line
        and not line.isdigit()
    ]
    return " ".join(cleaned)
def get_empty_df():
    """Return an empty DataFrame with the coding columns plus metadata columns.

    Columns are "File ID", "Coded Segment", "Code", followed by the keys of
    METADATA_FIELDS in their declared order.
    """
    column_names = ["File ID", "Coded Segment", "Code"]
    column_names.extend(METADATA_FIELDS)
    return pd.DataFrame(columns=column_names)
def process_file(file_obj):
    """Load an uploaded transcript and reset the coded-segment table.

    Args:
        file_obj: The upload handed over by the file component: ``None`` when
            nothing is selected, a path (``str`` / ``os.PathLike``), or an
            object whose ``name`` attribute is the path.

    Returns:
        A ``(text, file_name, empty_dataframe)`` tuple. With no upload, text
        and file name are empty strings.
    """
    if file_obj is None:
        return "", "", get_empty_df()

    # Depending on the Gradio version / the component's `type` setting the
    # handler receives either a plain path or a file wrapper exposing `.name`;
    # reading `.name` unconditionally fails on the former.
    if isinstance(file_obj, (str, os.PathLike)):
        path = os.fspath(file_obj)
    else:
        path = file_obj.name

    name = os.path.basename(path)
    lowered = name.lower()
    if lowered.endswith(".docx"):
        text = read_docx(path)
    elif lowered.endswith(".vtt"):
        text = read_vtt(path)
    else:
        # Anything else is read as plain UTF-8 text.
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
    return text, name, get_empty_df()
# --- Apply coding ---
def apply_code(df, segment, code, file_id, *metadata_values):
    """Append one coded segment to the table of coded segments.

    Args:
        df: DataFrame of segments coded so far.
        segment: Text of the segment to code.
        code: Code selected for the segment.
        file_id: Identifier of the loaded file; falsy when none is loaded.
        *metadata_values: Values paired positionally with the keys of
            METADATA_FIELDS.

    Returns:
        A ``(dataframe, status_message, segment_box_update)`` tuple. On a
        validation failure the DataFrame is returned unchanged and the
        segment box is left as it is; on success the new row is appended and
        the segment box is cleared.
    """
    # gr.update() with no arguments leaves the component untouched, so a
    # warning no longer throws away the text the user has already pasted.
    if not file_id:
        return df, "⚠️ Upload a file first", gr.update()
    # Whitespace-only text is not a usable segment either.
    if not segment or not segment.strip():
        return df, "⚠️ Paste a segment first", gr.update()
    if not code:
        return df, "⚠️ Select a code", gr.update()

    meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
    new_row = {"File ID": file_id, "Coded Segment": segment, "Code": code, **meta_dict}
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
    # Clear the segment box only after the code has actually been applied.
    return df, f"✅ Segment coded as '{code}'", gr.update(value="")
# --- Add new code ---
def add_new_code(new_code, code_list):
    """Return the code list extended with *new_code* when it is new.

    Args:
        new_code: Code typed by the user; may be ``None`` or blank.
        code_list: Current list of codes. It is never modified.

    Returns:
        ``code_list`` itself when *new_code* is blank or already present,
        otherwise a new list with the trimmed code appended.
    """
    # Trim so that " Foo " and "Foo" are not treated as different codes.
    candidate = (new_code or "").strip()
    if not candidate or candidate in code_list:
        return code_list
    # Build a new list instead of appending in place, so the caller's list
    # (and anything else holding a reference to it) stays untouched.
    return [*code_list, candidate]
# --- Export to Excel ---
def export_excel(df):
    """Write the coded segments to an .xlsx file.

    Args:
        df: DataFrame of coded segments; may be ``None`` or empty.

    Returns:
        ``(path, status_message)``. ``path`` is ``None`` when there is nothing
        to export, otherwise the path of the written "coded_segments.xlsx".
    """
    if df is None or df.empty:
        return None, "Nothing to export"

    # Each export gets its own fresh temporary directory: a fixed relative
    # path would be overwritten by every other export running in the same
    # process and depends on the working directory being writable.
    # The directory is not removed here because the caller still needs the file.
    export_dir = tempfile.mkdtemp(prefix="coded_segments_")
    path = os.path.join(export_dir, "coded_segments.xlsx")
    df.to_excel(path, index=False)
    return path, "Excel ready"
# ----------------------------
# GRADIO APP
# ----------------------------
def _dropdown_choices(codes):
    """Push the current list of codes into the dropdown's choices."""
    return gr.update(choices=codes)


def _mirror_table(df):
    """Pass the stored DataFrame through to the display table unchanged."""
    return df


def _reveal_export(exported):
    """Make the download component visible only when it holds a file."""
    return gr.update(visible=exported is not None)


# NOTE(review): the source had lost its indentation; the nesting below
# (two columns inside one row, upload + status underneath) is reconstructed
# from the "Left"/"Right" comments -- confirm against the intended layout.
with gr.Blocks() as demo:
    # --- States ---
    full_text = gr.State("")
    file_id = gr.State("")
    coded_df_state = gr.State(get_empty_df())
    code_categories_state = gr.State(DEFAULT_CODES)

    # --- Metadata on top: one textbox per metadata field, in declared order ---
    with gr.Row():
        metadata_inputs = [
            gr.Textbox(label=field_label)
            for field_label in METADATA_FIELDS.values()
        ]

    # --- Main interface ---
    with gr.Row():
        # Left: transcript
        with gr.Column(scale=3):
            transcript_box = gr.Textbox(
                label="Transcript (copy the text you want to code)",
                lines=25,
                interactive=True,
                placeholder="Upload a file to see transcript...",
            )

        # Right: coding tools
        with gr.Column(scale=2):
            gr.Markdown("## 🏷️ Code Segment")
            segment_box = gr.Textbox(
                label="Segment to code (paste here)",
                lines=4,
            )
            code_dropdown = gr.Dropdown(label="Select code", choices=DEFAULT_CODES)
            code_input = gr.Textbox(label="Or type new code")
            add_code_btn = gr.Button("Add new code")
            apply_btn = gr.Button("Apply code")

            gr.Markdown("## 📊 Coded Segments")
            table = gr.Dataframe(interactive=False)
            export_btn = gr.Button("Export XLSX")
            export_file = gr.File(visible=False)

    file_input = gr.File(
        label="Upload transcript",
        file_types=[".docx", ".vtt", ".txt"],
    )
    status = gr.Textbox(label="Status", value="Ready")

    # --- Callbacks ---
    # A new upload fills the transcript, records the file name and resets the
    # coded-segment state.
    file_input.change(
        fn=process_file,
        inputs=file_input,
        outputs=[transcript_box, file_id, coded_df_state],
    )

    # Adding a code updates the code-list state; the dropdown follows the state.
    add_code_btn.click(
        fn=add_new_code,
        inputs=[code_input, code_categories_state],
        outputs=[code_categories_state],
    )
    code_categories_state.change(
        fn=_dropdown_choices,
        inputs=code_categories_state,
        outputs=code_dropdown,
    )

    # Metadata textboxes are passed after the four fixed inputs, matching
    # apply_code's *metadata_values.
    apply_btn.click(
        fn=apply_code,
        inputs=[coded_df_state, segment_box, code_dropdown, file_id] + metadata_inputs,
        outputs=[coded_df_state, status, segment_box],
    )

    # The visible table mirrors the coded-segment state.
    coded_df_state.change(fn=_mirror_table, inputs=coded_df_state, outputs=table)

    # Export first, then reveal the download component if a file was produced.
    export_event = export_btn.click(
        fn=export_excel,
        inputs=coded_df_state,
        outputs=[export_file, status],
    )
    export_event.then(
        fn=_reveal_export,
        inputs=export_file,
        outputs=export_file,
    )

demo.launch()