Spaces:

clementBE
/

transcrib_coder

Sleeping

App Files Files Community

clementBE committed on Nov 25, 2025

Commit

20a147b

verified ·

1 Parent(s): 8e98afb

Create app.py

Browse files

Files changed (1) hide show

app.py +261 -0

app.py ADDED Viewed

	@@ -0,0 +1,261 @@

+import gradio as gr
+import pandas as pd
+import os
+import io
+# You will need to install python-docx for .docx file support
+try:
+    import docx
+except ImportError:
+    print("Warning: 'python-docx' library not found. Install with: pip install python-docx")
+    print("DOCX files will not be supported.")
+    docx = None
# --- 1. CONFIGURATION ---
# Codes offered in the "Select Code/Tag" dropdown for qualitative analysis.
DEFAULT_CODES = [
    "Theme: Communication Barrier",
    "Theme: Emotional Support",
    "Theme: Future Aspirations",
    "Theme: Financial Stress",
    "Other: Follow-up Needed",
]

# Interview metadata to collect: column key -> label shown on the input box.
# Insertion order matters: values are matched to keys positionally later on.
METADATA_FIELDS = dict(
    interview_id="Interview ID (e.g., I-001)",
    interview_date="Date of Interview (YYYY-MM-DD)",
    occupation="Participant Occupation",
    age="Participant Age",
)
+# --- 2. FILE PROCESSING FUNCTIONS ---
def read_docx(file_path):
    """Return the text of a .docx file, one line per paragraph.

    Args:
        file_path: Path of the .docx document to open.

    Returns:
        The paragraph texts joined with newlines, or an error message string
        when the optional python-docx dependency could not be imported.
    """
    # `docx` is None when the guarded import at the top of the file failed.
    if not docx:
        return "Error: python-docx library is not installed. Cannot read .docx."

    document = docx.Document(file_path)
    return '\n'.join([paragraph.text for paragraph in document.paragraphs])
def read_vtt(file_path):
    """Extract the spoken text from a WebVTT (.vtt) file.

    The file is walked line by line with a small state machine so that only
    cue payload text is kept:

    * the ``WEBVTT`` header and any header metadata lines directly below it
      are skipped (a leading UTF-8 byte-order mark is tolerated);
    * ``NOTE``, ``STYLE`` and ``REGION`` blocks are skipped up to the next
      blank line;
    * timing lines (containing ``-->``) are skipped;
    * cue identifiers (the line directly above a timing line) are skipped,
      whether numeric or not;
    * payload lines are kept even when they consist only of digits.

    Args:
        file_path: Path of the .vtt file to read.

    Returns:
        All kept lines joined by single spaces.
    """
    # 'utf-8-sig' strips an optional BOM, which would otherwise hide the
    # "WEBVTT" signature from the startswith() check below.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        lines = [line.strip() for line in f.read().split('\n')]

    transcript_lines = []
    in_cue = False          # True between a timing line and the next blank line
    skipping_block = False  # True inside a header/NOTE/STYLE/REGION block

    for position, line in enumerate(lines):
        next_line = lines[position + 1] if position + 1 < len(lines) else ""

        if not line:
            # A blank line terminates whatever block we were in.
            in_cue = False
            skipping_block = False
            continue

        if '-->' in line:
            in_cue = True
            skipping_block = False
            continue

        if in_cue:
            # Tolerate files without blank lines between cues: a numeric line
            # directly above the next timing line is that cue's identifier.
            if line.isdigit() and '-->' in next_line:
                continue
            transcript_lines.append(line)
            continue

        if skipping_block:
            continue

        if line.startswith("WEBVTT") or line.split(None, 1)[0] in ("NOTE", "STYLE", "REGION"):
            skipping_block = True
            continue

        if '-->' in next_line:
            # Cue identifier sitting directly above its timing line.
            continue

        # Stray text outside any cue (e.g. a file without timings) is kept.
        transcript_lines.append(line)

    return ' '.join(transcript_lines)
def process_file(file_obj):
    """Load an uploaded transcript and return its text plus a reset code table.

    Args:
        file_obj: The uploaded file, either an object exposing its path on a
            ``name`` attribute or a plain path string, or ``None`` when no
            file is present.

    Returns:
        A 4-tuple ``(text, status, file_id, coded_df)``:
        the transcript text, a status message, the file's base name, and an
        empty DataFrame of coded segments. The same four values are returned
        on every path (with empty text and file id when nothing could be
        loaded), so the function always matches four bound outputs.
    """
    # Coded data is cleared whenever a load is attempted.
    empty_codes = pd.DataFrame(columns=["File ID", "Code", "Coded Segment", "Context (100 chars)"])

    if file_obj is None:
        return "", "No file uploaded.", "", empty_codes

    # Accept both file-like wrappers (path on `.name`) and plain path strings.
    file_path = getattr(file_obj, "name", file_obj)
    filename = os.path.basename(file_path)
    lowered = filename.lower()

    # Deliberately broad: this is the UI boundary, and any reader failure
    # (missing file, bad encoding, corrupt document) should surface as a
    # status message rather than an unhandled error.
    try:
        if lowered.endswith('.docx'):
            text_content = read_docx(file_path)
        elif lowered.endswith('.vtt'):
            text_content = read_vtt(file_path)
        else:
            # For simple text files (or as a fallback)
            with open(file_path, 'r', encoding='utf-8') as f:
                text_content = f.read()
    except Exception as e:
        return "", f"Error reading file: {e}", "", empty_codes

    return text_content, f"✅ Loaded: {filename}", filename, empty_codes
+# --- 3. CODING/DATA MANAGEMENT FUNCTIONS ---
def apply_code(
    coded_data_df,
    file_id,
    full_text,
    segment_text,
    selected_code,
    metadata_values
):
    """Add one coded segment, with its metadata, to the table of codes.

    Args:
        coded_data_df: DataFrame of segments coded so far (may be empty or
            ``None``).
        file_id: Identifier of the transcript the segment comes from.
        full_text: Full transcript text, used to look up preceding context.
        segment_text: The text segment being coded; surrounding whitespace is
            trimmed before it is validated, searched for and stored.
        selected_code: The code/tag to attach to the segment.
        metadata_values: Values for the metadata fields, in the order of
            ``METADATA_FIELDS``. Surplus or missing values are not an error:
            pairing stops at the shorter of the two sequences.

    Returns:
        A 2-tuple ``(dataframe, status_message)``. When the segment or code
        is missing, the input frame is returned unchanged with a warning.
    """
    # Trim first so a whitespace-only paste is rejected and a paste with a
    # stray trailing newline can still be located in the transcript.
    segment_text = (segment_text or "").strip()
    if not segment_text or not selected_code:
        return coded_data_df, "⚠️ Please select a text segment and a code."

    meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values or []))

    # Context is the (up to) 100 characters preceding the first occurrence of
    # the segment in the transcript.
    transcript = full_text or ""
    start_index = transcript.find(segment_text)
    if start_index == -1:
        context = "Segment not found in transcript (may be due to formatting)."
    else:
        context = transcript[max(0, start_index - 100): start_index]
        context = '...' + context.replace('\n', ' ')

    new_row = {
        "File ID": file_id,
        "Code": selected_code,
        "Coded Segment": segment_text,
        "Context (100 chars)": context,
        **meta_dict  # Add all metadata fields to the row
    }
    new_row_df = pd.DataFrame([new_row])

    if coded_data_df is None or coded_data_df.empty:
        # Concatenating with an empty frame is deprecated in recent pandas;
        # build the result from the new row, keeping any existing column order.
        existing_columns = [] if coded_data_df is None else list(coded_data_df.columns)
        extra_columns = [col for col in new_row_df.columns if col not in existing_columns]
        new_df = new_row_df.reindex(columns=existing_columns + extra_columns)
    else:
        new_df = pd.concat([coded_data_df, new_row_df], ignore_index=True)

    return new_df, "✅ Code applied successfully!"
def generate_excel(coded_data_df):
    """Write the coded segments to an .xlsx file and return its path.

    Args:
        coded_data_df: DataFrame of coded segments (may be empty or ``None``).

    Returns:
        A 2-tuple ``(path, status_message)``. ``path`` is ``None`` when there
        is nothing to export; otherwise it points to a file named
        ``qualitative_codes.xlsx`` inside a freshly created temporary
        directory.
    """
    import tempfile

    if coded_data_df is None or coded_data_df.empty:
        return None, "⚠️ No codes have been applied yet."

    # A fresh directory per export keeps the download name stable while
    # preventing concurrent sessions from overwriting each other's file.
    # NOTE: these directories are not removed here.
    output_dir = tempfile.mkdtemp(prefix="qualitative_codes_")
    output_path = os.path.join(output_dir, "qualitative_codes.xlsx")

    # Requires the 'openpyxl' package for XLSX export.
    coded_data_df.to_excel(output_path, index=False, engine='openpyxl')
    return output_path, "✅ Excel file generated and ready for download."
+# --- 4. GRADIO INTERFACE ---
def _apply_code_from_ui(coded_data_df, file_id, full_text, segment_text, selected_code, *metadata_values):
    """Adapter between the "Apply Code" button and ``apply_code``.

    Each metadata textbox is bound as its own input, so its value arrives as
    a separate positional argument; they are collected here into the single
    list that ``apply_code`` expects as its last parameter.
    """
    return apply_code(
        coded_data_df,
        file_id,
        full_text,
        segment_text,
        selected_code,
        list(metadata_values),
    )


with gr.Blocks(title="Qualitative Coding Interface") as demo:
    gr.Markdown("# 📑 Qualitative Coding Interface")
    gr.Markdown(
        "Upload a `.docx`, `.vtt`, or `.txt` transcript, add interview metadata, and then "
        "copy text segments from the transcript box to the 'Segment to Code' box below to apply tags."
    )

    # --- State Management (Hidden) ---
    # Stores the currently loaded filename
    current_file_id = gr.State(value="")
    # Stores the full text content of the transcript
    full_transcript_text = gr.State(value="")
    # Stores the running table of coded segments
    coded_data_state = gr.State(
        value=pd.DataFrame(columns=["File ID", "Code", "Coded Segment", "Context (100 chars)"] + list(METADATA_FIELDS.keys()))
    )

    # --- A. FILE UPLOAD & METADATA ---
    with gr.Row():
        file_input = gr.File(
            label="Upload Transcript (.docx, .vtt, .txt)",
            file_types=[".docx", ".vtt", ".txt"],
            scale=1
        )
        status_message = gr.Textbox(label="Status", value="Ready", scale=2)

    gr.Markdown("---")
    gr.Markdown("## 📝 Interview Metadata")
    # Create one textbox per metadata field, in METADATA_FIELDS order
    metadata_inputs = []
    with gr.Row():
        for label in METADATA_FIELDS.values():
            metadata_inputs.append(gr.Textbox(label=label, value="", max_lines=1, interactive=True))

    gr.Markdown("---")

    # --- B. TRANSCRIPT VIEW ---
    gr.Markdown("## 📖 Transcript")
    # Display the full text (non-interactive so users copy from it)
    transcript_display = gr.Textbox(
        label="Transcript Content (Read-only - Copy segments from here)",
        lines=15,
        interactive=False,
        value="",
    )
    # Mirror the transcript state into the display box
    full_transcript_text.change(lambda x: x, inputs=full_transcript_text, outputs=transcript_display)

    gr.Markdown("---")

    # --- C. CODING/TAGGING CONTROLS ---
    gr.Markdown("## 🏷️ Apply Code")
    with gr.Row():
        segment_input = gr.Textbox(
            label="Segment to Code (Paste the text you copied from above)",
            lines=3,
            scale=3
        )
        code_dropdown = gr.Dropdown(
            label="Select Code/Tag",
            choices=DEFAULT_CODES,
            scale=1
        )
    code_btn = gr.Button("Apply Code & Save Segment", variant="primary")

    # --- D. CODED DATA & DOWNLOAD ---
    gr.Markdown("---")
    gr.Markdown("## 📊 Coded Data")
    # NOTE(review): `height` may not be accepted by every Gradio release
    # (newer ones appear to use `max_height`) - confirm against the pinned version.
    coded_output_df = gr.Dataframe(
        label="Current Coded Segments",
        interactive=False,
        height=300
    )
    # Mirror the coded-data state into the visible table
    coded_data_state.change(lambda x: x, inputs=coded_data_state, outputs=coded_output_df)

    with gr.Row():
        download_btn = gr.Button("Download Codes as XLSX", variant="secondary")
        download_file = gr.File(label="Download File")

    # --- E. ACTION BINDINGS ---
    # 1. File Upload Logic
    # Bound to the upload event, so the handler only runs when a file is
    # actually provided.
    file_input.upload(
        fn=process_file,
        inputs=file_input,
        outputs=[full_transcript_text, status_message, current_file_id, coded_data_state]
    )

    # 2. Apply Code Button Logic
    # The metadata textboxes are listed individually; the adapter regroups
    # their values into one list for apply_code.
    code_btn.click(
        fn=_apply_code_from_ui,
        inputs=[
            coded_data_state,
            current_file_id,
            full_transcript_text,
            segment_input,
            code_dropdown,
            *metadata_inputs
        ],
        outputs=[coded_data_state, status_message]
    )

    # 3. Download Button Logic
    download_btn.click(
        fn=generate_excel,
        inputs=coded_data_state,
        outputs=[download_file, status_message]
    )
def main():
    """Start the Gradio app.

    Running this needs the packages used by the file:
    pip install gradio pandas openpyxl python-docx
    """
    demo.launch()


# Launch the app only when executed as a script
if __name__ == "__main__":
    main()