Spaces:

clementBE
/

transcrib_coder

Sleeping

App Files Files Community

clementBE committed on Nov 25, 2025

Commit

9433a59

verified ·

1 Parent(s): addc2e8

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -24

app.py CHANGED Viewed

@@ -2,13 +2,13 @@ import gradio as gr
 import pandas as pd
 import os
 import io
 # You will need to install python-docx for .docx file support
 try:
     import docx
 except ImportError:
-    print("Warning: 'python-docx' library not found. Install with: pip install python-docx")
-    print("DOCX files will not be supported.")
-    docx = None
 # --- 1. CONFIGURATION ---
@@ -21,7 +21,7 @@ DEFAULT_CODES = [
     "Other: Follow-up Needed",
 ]
-# Define the metadata fields you want to collect
 METADATA_FIELDS = {
     "interview_id": "Interview ID (e.g., I-001)",
     "interview_date": "Date of Interview (YYYY-MM-DD)",
@@ -40,6 +40,7 @@ def read_docx(file_path):
     doc = docx.Document(file_path)
     full_text = []
     for para in doc.paragraphs:
         full_text.append(para.text)
     return '\n'.join(full_text)
@@ -53,12 +54,18 @@ def read_vtt(file_path):
     transcript_lines = []
     for line in lines:
         if line and not line.startswith("WEBVTT") and '-->' not in line and not line.isdigit():
             transcript_lines.append(line)
     return ' '.join(transcript_lines)
 def process_file(file_obj):
-    """Handles file upload and returns the plain text content."""
     if file_obj is None:
         return "", "No file uploaded.", ""
@@ -77,9 +84,10 @@ def process_file(file_obj):
         except Exception as e:
             return "", f"Error reading file: {e}", ""
-    # Clear the coded data state when a new file is loaded
-    initial_coded_df = pd.DataFrame(columns=["File ID", "Code", "Coded Segment", "Context (100 chars)"])
     return text_content, f"✅ Loaded: {filename}", filename, initial_coded_df
 # --- 3. CODING/DATA MANAGEMENT FUNCTIONS ---
@@ -97,18 +105,27 @@ def apply_code(
     # Check if a segment and code were provided
     if not segment_text or not selected_code:
         return coded_data_df, "⚠️ Please select a text segment and a code."
     # Extract the metadata values from the list
     meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
     # Find context: locate the start of the segment in the full text
     try:
-        start_index = full_text.index(segment_text)
         # Take 100 characters before the segment for context
-        context = full_text[max(0, start_index - 100): start_index]
-        context = '...' + context.replace('\n', ' ')
     except ValueError:
-        context = "Segment not found in transcript (may be due to formatting)."
     # Create the new row
     new_row = {
@@ -120,6 +137,7 @@ def apply_code(
     }
     # Append the new row to the DataFrame
     new_df = pd.concat([coded_data_df, pd.Series(new_row).to_frame().T], ignore_index=True)
     return new_df, "✅ Code applied successfully!"
@@ -131,9 +149,11 @@ def generate_excel(coded_data_df):
     output_path = "qualitative_codes.xlsx"
     # Ensure the 'openpyxl' engine is available for XLSX export
-    coded_data_df.to_excel(output_path, index=False, engine='openpyxl')
-    return output_path, "✅ Excel file generated and ready for download."
 # --- 4. GRADIO INTERFACE ---
@@ -150,10 +170,8 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
     current_file_id = gr.State(value="")
     # Stores the full text content of the transcript
     full_transcript_text = gr.State(value="")
-    # Stores the running list of codes
-    coded_data_state = gr.State(
-        value=pd.DataFrame(columns=["File ID", "Code", "Coded Segment", "Context (100 chars)"] + list(METADATA_FIELDS.keys()))
-    )
     # --- A. FILE UPLOAD & METADATA ---
     with gr.Row():
@@ -164,6 +182,8 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
         )
         status_message = gr.Textbox(label="Status", value="Ready", scale=2)
     gr.Interface(
         fn=process_file,
         inputs=file_input,
@@ -171,7 +191,6 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
         api_name=False,
         live=False,
         # Hide the default UI generated by Interface (we handle it below)
-        allow_flagging="never",
     ).clear()
     gr.Markdown("---")
@@ -187,14 +206,16 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
     # --- B. TRANSCRIPT VIEW ---
     gr.Markdown("## 📖 Transcript")
     # Display the full text (non-interactive so users copy from it)
     transcript_display = gr.Textbox(
         label="Transcript Content (Read-only - Copy segments from here)",
         lines=15,
         interactive=False,
         value="",
     )
-    # Connect the state to the display box
     full_transcript_text.change(lambda x: x, inputs=full_transcript_text, outputs=transcript_display)
     gr.Markdown("---")
@@ -224,12 +245,12 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
         interactive=False,
         height=300
     )
-    # Initialize the dataframe display with the state
     coded_data_state.change(lambda x: x, inputs=coded_data_state, outputs=coded_output_df)
     with gr.Row():
         download_btn = gr.Button("Download Codes as XLSX", variant="secondary")
-        download_file = gr.File(label="Download File")
     # --- E. ACTION BINDINGS ---
@@ -252,10 +273,15 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
         fn=generate_excel,
         inputs=coded_data_state,
         outputs=[download_file, status_message]
     )
 # Launch the app
 if __name__ == "__main__":
-    # Note: If running this, you may need to install:
-    # pip install gradio pandas openpyxl python-docx
     demo.launch()

 import pandas as pd
 import os
 import io
 # You will need to install python-docx for .docx file support
 try:
     import docx
 except ImportError:
+    # Set docx to None so the application can still run, but DOCX will be disabled.
+    docx = None
 # --- 1. CONFIGURATION ---
     "Other: Follow-up Needed",
 ]
+# Define the metadata fields you want to collect (key: variable name, value: UI label)
 METADATA_FIELDS = {
     "interview_id": "Interview ID (e.g., I-001)",
     "interview_date": "Date of Interview (YYYY-MM-DD)",
     doc = docx.Document(file_path)
     full_text = []
     for para in doc.paragraphs:
+        # Collect each paragraph's text; the paragraphs are joined with newlines after the loop
         full_text.append(para.text)
     return '\n'.join(full_text)
     transcript_lines = []
     for line in lines:
         if line and not line.startswith("WEBVTT") and '-->' not in line and not line.isdigit():
+            # Keep the spoken-text line as-is; lines are joined with single spaces after the loop
             transcript_lines.append(line)
     return ' '.join(transcript_lines)
+def get_initial_df():
+    """Returns an empty DataFrame with all necessary columns (metadata + core data)."""
+    core_columns = ["File ID", "Code", "Coded Segment", "Context (100 chars)"]
+    return pd.DataFrame(columns=core_columns + list(METADATA_FIELDS.keys()))
 def process_file(file_obj):
+    """Handles file upload and returns the plain text content and resets state."""
     if file_obj is None:
         return "", "No file uploaded.", ""
         except Exception as e:
             return "", f"Error reading file: {e}", ""
+    # Reset the coded data state when a new file is loaded
+    initial_coded_df = get_initial_df()
+    # The function returns the full text, a status message, the filename (ID), and the reset DataFrame
     return text_content, f"✅ Loaded: {filename}", filename, initial_coded_df
 # --- 3. CODING/DATA MANAGEMENT FUNCTIONS ---
     # Check if a segment and code were provided
     if not segment_text or not selected_code:
         return coded_data_df, "⚠️ Please select a text segment and a code."
+    if not file_id:
+         return coded_data_df, "⚠️ Please upload a file first."
     # Extract the metadata values from the list
+    # metadata_values is a list of strings corresponding to the keys in METADATA_FIELDS
     meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
     # Find context: locate the start of the segment in the full text
+    context = "Context not available (segment match failed)"
     try:
+        # Normalize whitespace to improve matching chances
+        normalized_full_text = ' '.join(full_text.split())
+        normalized_segment = ' '.join(segment_text.split())
+        start_index = normalized_full_text.index(normalized_segment)
         # Take 100 characters before the segment for context
+        context = normalized_full_text[max(0, start_index - 100): start_index]
+        context = '...' + context.strip()
     except ValueError:
+        pass # Keep default error message if segment not found
     # Create the new row
     new_row = {
     }
     # Append the new row to the DataFrame
+    # Note: Use pd.concat for reliable appending in modern pandas
     new_df = pd.concat([coded_data_df, pd.Series(new_row).to_frame().T], ignore_index=True)
     return new_df, "✅ Code applied successfully!"
     output_path = "qualitative_codes.xlsx"
     # Ensure the 'openpyxl' engine is available for XLSX export
+    try:
+        coded_data_df.to_excel(output_path, index=False, engine='openpyxl')
+        return output_path, "✅ Excel file generated and ready for download."
+    except Exception as e:
+        return None, f"❌ Error generating Excel file: {e}"
 # --- 4. GRADIO INTERFACE ---
     current_file_id = gr.State(value="")
     # Stores the full text content of the transcript
     full_transcript_text = gr.State(value="")
+    # Stores the running list of codes (DataFrame)
+    coded_data_state = gr.State(value=get_initial_df())
     # --- A. FILE UPLOAD & METADATA ---
     with gr.Row():
         )
         status_message = gr.Textbox(label="Status", value="Ready", scale=2)
+    # Use gr.Interface briefly just to handle the file upload and state update cleanly
+    # NOTE: The 'allow_flagging' argument has been removed to fix the TypeError.
     gr.Interface(
         fn=process_file,
         inputs=file_input,
         api_name=False,
         live=False,
         # Hide the default UI generated by Interface (we handle it below)
     ).clear()
     gr.Markdown("---")
     # --- B. TRANSCRIPT VIEW ---
     gr.Markdown("## 📖 Transcript")
     # Display the full text (non-interactive so users copy from it)
     transcript_display = gr.Textbox(
         label="Transcript Content (Read-only - Copy segments from here)",
         lines=15,
         interactive=False,
         value="",
+        elem_id="transcript-display"
     )
+    # Update the transcript display whenever the hidden state changes
     full_transcript_text.change(lambda x: x, inputs=full_transcript_text, outputs=transcript_display)
     gr.Markdown("---")
         interactive=False,
         height=300
     )
+    # Update the dataframe display whenever the hidden state changes
     coded_data_state.change(lambda x: x, inputs=coded_data_state, outputs=coded_output_df)
     with gr.Row():
         download_btn = gr.Button("Download Codes as XLSX", variant="secondary")
+        download_file = gr.File(label="Download File", visible=False) # Keep hidden until ready
     # --- E. ACTION BINDINGS ---
         fn=generate_excel,
         inputs=coded_data_state,
         outputs=[download_file, status_message]
+    ).then(
+        # Make the file visible after generation
+        lambda x: gr.update(visible=True) if x[0] else gr.update(visible=False),
+        inputs=[download_file],
+        outputs=[download_file]
     )
 # Launch the app
 if __name__ == "__main__":
+    # Ensure you have installed: pip install gradio pandas openpyxl python-docx
     demo.launch()