Spaces:

clementBE
/

transcrib_coder

Sleeping

App Files Files Community

clementBE committed on Nov 25, 2025

Commit

2aa6081

verified ·

1 Parent(s): 211688e

Update app.py

Browse files

Files changed (1) hide show

app.py +154 -189

app.py CHANGED Viewed

@@ -1,14 +1,15 @@
 import gradio as gr
 import pandas as pd
 import os
-import io
 try:
     import docx
 except ImportError:
-    docx = None
-# --- 1. CONFIGURATION ---
 DEFAULT_CODES = [
     "Theme: Communication Barrier",
@@ -26,33 +27,38 @@ METADATA_FIELDS = {
 }
-# --- 2. FILE PROCESSING FUNCTIONS ---
 def read_docx(file_path):
     if not docx:
-        return "Error: python-docx library is not installed. Cannot read .docx."
-    doc = docx.Document(file_path)
-    full_text = []
-    for para in doc.paragraphs:
-        full_text.append(para.text)
-    return '\n'.join(full_text)
 def read_vtt(file_path):
-    with open(file_path, 'r', encoding='utf-8') as f:
         content = f.read()
-    lines = [line.strip() for line in content.split('\n')]
-    transcript_lines = []
-    for line in lines:
-        if line and not line.startswith("WEBVTT") and '-->' not in line and not line.isdigit():
-            transcript_lines.append(line)
-    return ' '.join(transcript_lines)
 def get_initial_df():
-    core_columns = ["File ID", "Code", "Coded Segment", "Context (100 chars)"]
-    return pd.DataFrame(columns=core_columns + list(METADATA_FIELDS.keys()))
 def process_file(file_obj):
     if file_obj is None:
@@ -60,191 +66,150 @@ def process_file(file_obj):
     file_path = file_obj.name
     filename = os.path.basename(file_path)
-    if filename.lower().endswith('.docx'):
-        text_content = read_docx(file_path)
-    elif filename.lower().endswith('.vtt'):
-        text_content = read_vtt(file_path)
     else:
-        try:
-            with open(file_path, 'r', encoding='utf-8') as f:
-                text_content = f.read()
-        except Exception as e:
-            return "", f"Error reading file: {e}", "", get_initial_df()
-    initial_coded_df = get_initial_df()
-    return text_content, f"✅ Loaded: {filename}", filename, initial_coded_df
-# --- 3. CODING/DATA MANAGEMENT FUNCTIONS ---
-def apply_code(
-    coded_data_df,
-    file_id,
-    full_text,
-    segment_text,
-    selected_code,
-    metadata_values
-):
     if not segment_text or not selected_code:
-        return coded_data_df, "⚠️ Please select a text segment and a code."
-    if not file_id:
-         return coded_data_df, "⚠️ Please upload a file first."
     meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
-    context = "Context not available (segment match failed)"
     try:
-        normalized_full_text = ' '.join(full_text.split())
-        normalized_segment = ' '.join(segment_text.split())
-        start_index = normalized_full_text.index(normalized_segment)
-        context = normalized_full_text[max(0, start_index - 100): start_index]
-        context = '...' + context.strip()
-    except ValueError:
-        pass
     new_row = {
         "File ID": file_id,
         "Code": selected_code,
         "Coded Segment": segment_text,
         "Context (100 chars)": context,
-        **meta_dict
     }
-    new_df = pd.concat([coded_data_df, pd.Series(new_row).to_frame().T], ignore_index=True)
-    return new_df, "✅ Code applied successfully!"
-def generate_excel(coded_data_df):
-    if coded_data_df.empty:
-        return None, "⚠️ No codes have been applied yet."
-    output_path = "qualitative_codes.xlsx"
-    try:
-        coded_data_df.to_excel(output_path, index=False, engine='openpyxl')
-        return output_path, "✅ Excel file generated and ready for download."
-    except Exception as e:
-        return None, f"❌ Error generating Excel file: {e}"
-# --- 4. GRADIO INTERFACE ---
 with gr.Blocks(title="Qualitative Coding Interface") as demo:
     gr.Markdown("# 📑 Qualitative Coding Interface")
-    gr.Markdown(
-        "Upload a `.docx`, `.vtt`, or `.txt` transcript, add interview metadata, and then "
-        "copy text segments from the transcript box to the 'Segment to Code' box below to apply tags."
     )
-    # --- State Management (Hidden) ---
-    current_file_id = gr.State(value="")
-    full_transcript_text = gr.State(value="")
-    coded_data_state = gr.State(value=get_initial_df())
-    # *** FIX IMPLEMENTATION: Put all visible UI inside a single Column ***
-    with gr.Column():
-        # --- A. FILE UPLOAD & METADATA ---
-        with gr.Row():
-            file_input = gr.File(
-                label="Upload Transcript (.docx, .vtt, .txt)",
-                file_types=[".docx", ".vtt", ".txt"],
-                scale=1
-            )
-            status_message = gr.Textbox(label="Status", value="Ready", scale=2)
-        file_input.change(
-            fn=process_file,
-            inputs=file_input,
-            outputs=[full_transcript_text, status_message, current_file_id, coded_data_state]
-        )
-        gr.Markdown("---")
-        gr.Markdown("## 📝 Interview Metadata")
-        metadata_inputs = []
-        with gr.Row():
-            for key, label in METADATA_FIELDS.items():
-                metadata_inputs.append(gr.Textbox(label=label, value="", max_lines=1, interactive=True))
-        gr.Markdown("---")
-        # --- B. TRANSCRIPT VIEW ---
-        gr.Markdown("## 📖 Transcript")
-        transcript_display = gr.Textbox(
-            label="Transcript Content (Read-only - Copy segments from here)",
-            lines=15,
-            interactive=False,
-            value="",
-            elem_id="transcript-display"
-        )
-        full_transcript_text.change(lambda x: x, inputs=full_transcript_text, outputs=transcript_display)
-        gr.Markdown("---")
-        # --- C. CODING/TAGGING CONTROLS ---
-        gr.Markdown("## 🏷️ Apply Code")
-        with gr.Row():
-            segment_input = gr.Textbox(
-                label="Segment to Code (Paste the text you copied from above)",
-                lines=3,
-                scale=3
-            )
-            code_dropdown = gr.Dropdown(
-                label="Select Code/Tag",
-                choices=DEFAULT_CODES,
-                scale=1
-            )
-        code_btn = gr.Button("Apply Code & Save Segment", variant="primary")
-        # --- D. CODED DATA & DOWNLOAD ---
-        gr.Markdown("---")
-        gr.Markdown("## 📊 Coded Data")
-        coded_output_df = gr.Dataframe(
-            label="Current Coded Segments",
-            interactive=False,
-        )
-        coded_data_state.change(lambda x: x, inputs=coded_data_state, outputs=coded_output_df)
-        with gr.Row():
-            download_btn = gr.Button("Download Codes as XLSX", variant="secondary")
-            download_file = gr.File(label="Download File", visible=False)
-        # --- E. ACTION BINDINGS ---
-        code_btn.click(
-            fn=apply_code,
-            inputs=[
-                coded_data_state,
-                current_file_id,
-                full_transcript_text,
-                segment_input,
-                code_dropdown,
-                gr.List(metadata_inputs)
-            ],
-            outputs=[coded_data_state, status_message]
-        )
-        download_btn.click(
-            fn=generate_excel,
-            inputs=coded_data_state,
-            outputs=[download_file, status_message]
-        ).then(
-            lambda x: gr.update(visible=True) if x[0] else gr.update(visible=False),
-            inputs=[download_file],
-            outputs=[download_file]
-        )
-# Launch the app
 if __name__ == "__main__":
-    # Use explicit server settings to ensure wide compatibility
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False
-    )

 import gradio as gr
 import pandas as pd
 import os
 try:
     import docx
 except ImportError:
+    docx = None
+# ----------------------------
+# CONFIG
+# ----------------------------
 DEFAULT_CODES = [
     "Theme: Communication Barrier",
 }
+# ----------------------------
+# FILE READERS
+# ----------------------------
 def read_docx(file_path):
     if not docx:
+        return "Error: python-docx is not installed."
+    document = docx.Document(file_path)
+    return "\n".join([p.text for p in document.paragraphs])
 def read_vtt(file_path):
+    with open(file_path, "r", encoding="utf-8") as f:
         content = f.read()
+    lines = [l.strip() for l in content.split("\n")]
+    transcript = [
+        l for l in lines
+        if l and not l.startswith("WEBVTT") and "-->" not in l and not l.isdigit()
+    ]
+    return " ".join(transcript)
 def get_initial_df():
+    cols = ["File ID", "Code", "Coded Segment", "Context (100 chars)"] + list(METADATA_FIELDS.keys())
+    return pd.DataFrame(columns=cols)
+# ----------------------------
+# FILE PROCESSING
+# ----------------------------
 def process_file(file_obj):
     if file_obj is None:
     file_path = file_obj.name
     filename = os.path.basename(file_path)
+    if filename.lower().endswith(".docx"):
+        text = read_docx(file_path)
+    elif filename.lower().endswith(".vtt"):
+        text = read_vtt(file_path)
     else:
+        with open(file_path, "r", encoding="utf-8") as f:
+            text = f.read()
+    return text, f"Loaded: {filename}", filename, get_initial_df()
+# ----------------------------
+# CODING FUNCTION
+# ----------------------------
+def apply_code(coded_df, file_id, full_text, segment_text, selected_code, *metadata_values):
     if not segment_text or not selected_code:
+        return coded_df, "Please enter a segment and select a code."
     meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
+    # Extract simple context
+    context = "Context unavailable"
     try:
+        norm_full = " ".join(full_text.split())
+        norm_seg = " ".join(segment_text.split())
+        idx = norm_full.index(norm_seg)
+        context = "..." + norm_full[max(0, idx - 100):idx]
+    except:
+        pass
     new_row = {
         "File ID": file_id,
         "Code": selected_code,
         "Coded Segment": segment_text,
         "Context (100 chars)": context,
+        **meta_dict
     }
+    new_df = pd.concat([coded_df, pd.DataFrame([new_row])], ignore_index=True)
+    return new_df, "Code applied!"
+# ----------------------------
+# EXPORT EXCEL
+# ----------------------------
+def generate_excel(coded_df):
+    if coded_df.empty:
+        return None, "No coded segments yet."
+    out_path = "qualitative_codes.xlsx"
+    coded_df.to_excel(out_path, index=False)
+    return out_path, "Excel ready for download."
+# ----------------------------
+# GRADIO UI
+# ----------------------------
 with gr.Blocks(title="Qualitative Coding Interface") as demo:
     gr.Markdown("# 📑 Qualitative Coding Interface")
+    gr.Markdown("Load transcripts → add metadata → code text segments → export Excel.")
+    # Hidden states
+    current_file_id = gr.State("")
+    full_text_state = gr.State("")
+    coded_state = gr.State(get_initial_df())
+    # -----------------------
+    # File upload
+    # -----------------------
+    with gr.Row():
+        file_input = gr.File(label="Upload (.docx, .vtt, .txt)", file_types=[".docx", ".vtt", ".txt"])
+        status = gr.Textbox(label="Status", value="Ready")
+    file_input.change(
+        fn=process_file,
+        inputs=file_input,
+        outputs=[full_text_state, status, current_file_id, coded_state]
     )
+    # -----------------------
+    # Metadata
+    # -----------------------
+    gr.Markdown("## 📝 Interview Metadata")
+    metadata_inputs = []
+    with gr.Row():
+        for key, lbl in METADATA_FIELDS.items():
+            box = gr.Textbox(label=lbl)
+            metadata_inputs.append(box)
+    # -----------------------
+    # Transcript
+    # -----------------------
+    gr.Markdown("## 📖 Transcript")
+    transcript_box = gr.Textbox(label="Transcript (read-only)", lines=15, interactive=False)
+    full_text_state.change(lambda x: x, inputs=full_text_state, outputs=transcript_box)
+    # -----------------------
+    # Coding controls
+    # -----------------------
+    gr.Markdown("## 🏷️ Apply Code")
+    with gr.Row():
+        segment_input = gr.Textbox(label="Paste segment", lines=3)
+        code_choice = gr.Dropdown(label="Select Code", choices=DEFAULT_CODES)
+    code_btn = gr.Button("Apply Code")
+    code_btn.click(
+        fn=apply_code,
+        inputs=[coded_state, current_file_id, full_text_state, segment_input, code_choice] + metadata_inputs,
+        outputs=[coded_state, status]
+    )
+    # -----------------------
+    # Show Data
+    # -----------------------
+    gr.Markdown("## 📊 Coded Data")
+    data_table = gr.Dataframe(interactive=False)
+    coded_state.change(lambda x: x, inputs=coded_state, outputs=data_table)
+    # -----------------------
+    # Download Excel
+    # -----------------------
+    download_btn = gr.Button("Download XLSX")
+    download_file = gr.File(label="Download", visible=False)
+    def show_file(file):
+        return gr.update(visible=file is not None)
+    download_btn.click(
+        generate_excel,
+        inputs=coded_state,
+        outputs=[download_file, status]
+    ).then(
+        show_file,
+        inputs=download_file,
+        outputs=download_file
+    )
+# Launch for HF Spaces
 if __name__ == "__main__":
+    demo.launch()