Spaces:

clementBE
/

transcrib_coder

Sleeping

App Files Files Community

clementBE committed on Nov 25, 2025

Commit

af98792

verified ·

1 Parent(s): 4741434

Update app.py

Browse files

Files changed (1) hide show

app.py +171 -24

app.py CHANGED Viewed

@@ -1,49 +1,196 @@
 import gradio as gr
 import pandas as pd
-# Default codes
-CODES = ["Theme: Communication Barrier","Theme: Emotional Support"]
-# State
-coded_df_state = pd.DataFrame(columns=["Segment","Code"])
-# Build transcript HTML with JS to store selection
-def build_transcript_html(text):
     html = f"""
-    <div id="transcript" style='white-space: pre-wrap; border:1px solid #ccc; padding:5px; max-height:400px; overflow:auto;'>{text}</div>
     <script>
-    const transcript = document.getElementById("transcript");
-    transcript.addEventListener("mouseup", function() {{
         const sel = window.getSelection().toString();
         if(sel.length>0){{
-            document.querySelector("#selected_segment").value = sel;
         }}
     }});
     </script>
     """
     return html
-# Apply code to selected segment
-def apply_code(selected_segment, code, df):
-    if not selected_segment or not code:
-        return df, "⚠️ Select segment and code first"
-    new_row = {"Segment": selected_segment, "Code": code}
     df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
-    return df, f"✅ Segment coded as {code}"
-# Gradio interface
 with gr.Blocks() as demo:
-    transcript_text = "This is a sample transcript. You can select any part of this text to code it."
     with gr.Row():
         with gr.Column(scale=3):
-            transcript_html = gr.HTML(build_transcript_html(transcript_text))
         with gr.Column(scale=2):
-            selected_segment = gr.Textbox(label="Selected Segment", interactive=False, elem_id="selected_segment")
-            code_dropdown = gr.Dropdown(label="Select Code", choices=CODES)
-            code_btn = gr.Button("Apply Code")
-            coded_table = gr.Dataframe(headers=["Segment","Code"])
-    code_btn.click(apply_code, inputs=[selected_segment, code_dropdown, coded_table], outputs=[coded_table, gr.Textbox(label="Status")])
 demo.launch()

 import gradio as gr
 import pandas as pd
+import os
+try:
+    import docx
+except ImportError:
+    docx = None
+# ------------------------------
+# CONFIG
+# ------------------------------
+# Qualitative codes offered in the UI; one button is created per entry, in this order.
+CODES = [
+    "Communication Barrier",
+    "Emotional Support",
+    "Future Aspirations",
+    "Financial Stress",
+    "Follow-up Needed",
+]
+# Interview metadata: DataFrame column key -> label shown on the matching textbox.
+# The keys are appended as extra columns of the coded-segments table.
+METADATA_FIELDS = {
+    "interview_id": "Interview ID",
+    "interview_date": "Interview Date",
+    "occupation": "Occupation",
+    "age": "Age",
+}
+# Highlight colour (CSS colour name) per code. build_transcript_html falls back
+# to "yellow" for a code that has no entry here.
+COLOR_MAP = {
+    "Communication Barrier": "lightblue",
+    "Emotional Support": "lightgreen",
+    "Future Aspirations": "khaki",
+    "Financial Stress": "lightpink",
+    "Follow-up Needed": "orange",
+}
+# ------------------------------
+# FILE PROCESSING
+# ------------------------------
+def read_docx(path):
+    if not docx:
+        return "Error: python-docx not installed."
+    d = docx.Document(path)
+    return "\n".join([p.text for p in d.paragraphs])
+def read_vtt(path):
+    with open(path, "r", encoding="utf-8") as f:
+        lines = f.read().split("\n")
+    cleaned = [
+        l.strip()
+        for l in lines
+        if l and "WEBVTT" not in l and "-->" not in l and not l.strip().isdigit()
+    ]
+    return " ".join(cleaned)
+def get_empty_df():
+    return pd.DataFrame(
+        columns=["File ID", "Coded Segment", "Code"] + list(METADATA_FIELDS.keys())
+    )
+def process_file(file_obj):
+    if file_obj is None:
+        return "", "", get_empty_df()
+    path = file_obj.name
+    name = os.path.basename(path)
+    if name.lower().endswith(".docx"):
+        text = read_docx(path)
+    elif name.lower().endswith(".vtt"):
+        text = read_vtt(path)
+    else:
+        with open(path, "r", encoding="utf-8") as f:
+            text = f.read()
+    return text, name, get_empty_df()
+# ------------------------------
+# BUILD TRANSCRIPT HTML
+# ------------------------------
+def build_transcript_html(text, df):
+    display_text = text
+    if df is not None and not df.empty:
+        for _, row in df.iterrows():
+            seg = row["Coded Segment"]
+            color = COLOR_MAP.get(row["Code"], "yellow")
+            display_text = display_text.replace(seg, f"<span style='background-color:{color}'>{seg}</span>", 1)
+    safe_text = display_text.replace("\n", "<br>")
     html = f"""
+    <div id='transcript' style='white-space: pre-wrap; font-size:16px; line-height:1.5; max-height:600px; overflow:auto; border:1px solid #ccc; padding:5px;'>
+        {safe_text}
+    </div>
     <script>
+    const transcript = document.getElementById('transcript');
+    transcript.addEventListener('mouseup', function() {{
         const sel = window.getSelection().toString();
         if(sel.length>0){{
+            // store in hidden input
+            const state_input = document.querySelector('#selected_segment_state');
+            if(state_input) {{
+                state_input.value = sel;
+                state_input.dispatchEvent(new Event("input",{ {bubbles:true} }));
+            }}
         }}
     }});
     </script>
     """
     return html
+# ------------------------------
+# APPLY CODE
+# ------------------------------
+def apply_code(df, segment, code, file_id, *metadata_values):
+    if not segment or not code or not file_id:
+        return df, "⚠️ Select text and file first"
+    meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
+    new_row = {"File ID": file_id, "Coded Segment": segment, "Code": code, **meta_dict}
     df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
+    return df, f"✅ Segment coded as '{code}'"
+# ------------------------------
+# EXPORT XLSX
+# ------------------------------
+def export_excel(df):
+    if df.empty:
+        return None, "Nothing to export"
+    path = "coded_segments.xlsx"
+    df.to_excel(path, index=False)
+    return path, "Excel ready"
+# ------------------------------
+# GRADIO UI
+# ------------------------------
 with gr.Blocks() as demo:
+    # States
+    full_text = gr.State("")
+    file_id = gr.State("")
+    coded_df_state = gr.State(get_empty_df())
+    selected_segment_state = gr.State("")
+    # ---------------- Metadata Top ----------------
     with gr.Row():
+        metadata_inputs = []
+        for k,lbl in METADATA_FIELDS.items():
+            metadata_inputs.append(gr.Textbox(label=lbl))
+    # ---------------- Transcript + Coding ----------------
+    with gr.Row():
+        # Left: transcript
         with gr.Column(scale=3):
+            transcript_html = gr.HTML()
+            # Hidden state to store selected segment
+            selected_segment = gr.Textbox(label="Selected segment (hidden)", interactive=False, visible=False, elem_id="selected_segment_state")
+        # Right: code buttons + table
         with gr.Column(scale=2):
+            gr.Markdown("## 🏷️ Code Categories")
+            code_buttons = []
+            for c in CODES:
+                btn = gr.Button(c)
+                code_buttons.append(btn)
+            gr.Markdown("## 📊 Coded Segments")
+            table = gr.Dataframe(interactive=False)
+            export_btn = gr.Button("Export XLSX")
+            export_file = gr.File(visible=False)
+            file_input = gr.File(label="Upload transcript", file_types=[".docx",".vtt",".txt"])
+            status = gr.Textbox(label="Status", value="Ready")
+    # ---------------- Callbacks ----------------
+    # Load file
+    file_input.change(fn=process_file, inputs=file_input, outputs=[full_text, file_id, coded_df_state])
+    # Update transcript HTML
+    def update_transcript(text, df):
+        return build_transcript_html(text, df)
+    full_text.change(update_transcript, inputs=[full_text, coded_df_state], outputs=transcript_html)
+    coded_df_state.change(update_transcript, inputs=[full_text, coded_df_state], outputs=transcript_html)
+    # Apply code buttons
+    for btn, code_name in zip(code_buttons, CODES):
+        btn.click(
+            apply_code,
+            inputs=[coded_df_state, selected_segment_state, gr.State(code_name), file_id] + metadata_inputs,
+            outputs=[coded_df_state, status]
+        )
+    # Update table
+    coded_df_state.change(lambda df: df, inputs=coded_df_state, outputs=table)
+    # Export
+    export_btn.click(export_excel, inputs=coded_df_state, outputs=[export_file, status]).then(
+        lambda f: gr.update(visible=f is not None),
+        inputs=export_file,
+        outputs=export_file
+    )
 demo.launch()