Spaces:

clementBE
/

transcrib_coder

Sleeping

App Files Files Community

clementBE committed on Nov 25, 2025

Commit

211688e

verified ·

1 Parent(s): 96b2741

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -45

app.py CHANGED Viewed

@@ -3,16 +3,13 @@ import pandas as pd
 import os
 import io
-# You will need to install python-docx for .docx file support
 try:
     import docx
 except ImportError:
-    # Set docx to None so the application can still run, but DOCX will be disabled.
     docx = None
 # --- 1. CONFIGURATION ---
-# Define the default codes for qualitative analysis
 DEFAULT_CODES = [
     "Theme: Communication Barrier",
     "Theme: Emotional Support",
@@ -21,7 +18,6 @@ DEFAULT_CODES = [
     "Other: Follow-up Needed",
 ]
-# Define the metadata fields you want to collect (key: variable name, value: UI label)
 METADATA_FIELDS = {
     "interview_id": "Interview ID (e.g., I-001)",
     "interview_date": "Date of Interview (YYYY-MM-DD)",
@@ -33,41 +29,33 @@ METADATA_FIELDS = {
 # --- 2. FILE PROCESSING FUNCTIONS ---
 def read_docx(file_path):
-    """Extracts plain text from a .docx file."""
     if not docx:
         return "Error: python-docx library is not installed. Cannot read .docx."
     doc = docx.Document(file_path)
     full_text = []
     for para in doc.paragraphs:
-        # Join paragraphs with a newline for better readability
         full_text.append(para.text)
     return '\n'.join(full_text)
 def read_vtt(file_path):
-    """Extracts text from a .vtt file (simply ignoring time codes/metadata)."""
     with open(file_path, 'r', encoding='utf-8') as f:
         content = f.read()
-    # Simple heuristic to strip VTT specific lines (WEBVTT, time stamps, blank lines)
     lines = [line.strip() for line in content.split('\n')]
     transcript_lines = []
     for line in lines:
         if line and not line.startswith("WEBVTT") and '-->' not in line and not line.isdigit():
-            # Add a space at the end of each line to combine them into a single transcript
             transcript_lines.append(line)
     return ' '.join(transcript_lines)
 def get_initial_df():
-    """Returns an empty DataFrame with all necessary columns (metadata + core data)."""
     core_columns = ["File ID", "Code", "Coded Segment", "Context (100 chars)"]
     return pd.DataFrame(columns=core_columns + list(METADATA_FIELDS.keys()))
 def process_file(file_obj):
-    """Handles file upload and returns the plain text content and resets state."""
     if file_obj is None:
-        # Return default values to clear the interface
         return "", "No file uploaded.", "", get_initial_df()
     file_path = file_obj.name
@@ -78,17 +66,14 @@ def process_file(file_obj):
     elif filename.lower().endswith('.vtt'):
         text_content = read_vtt(file_path)
     else:
-        # For simple text files (or as a fallback)
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
                 text_content = f.read()
         except Exception as e:
             return "", f"Error reading file: {e}", "", get_initial_df()
-    # Reset the coded data state when a new file is loaded
     initial_coded_df = get_initial_df()
-    # The function returns the full text, a status message, the filename (ID), and the reset DataFrame
     return text_content, f"✅ Loaded: {filename}", filename, initial_coded_df
 # --- 3. CODING/DATA MANAGEMENT FUNCTIONS ---
@@ -101,54 +86,41 @@ def apply_code(
     selected_code,
     metadata_values
 ):
-    """Adds a new coded segment and metadata to the DataFrame."""
-    # Check if a segment and code were provided
     if not segment_text or not selected_code:
         return coded_data_df, "⚠️ Please select a text segment and a code."
     if not file_id:
          return coded_data_df, "⚠️ Please upload a file first."
-    # Extract the metadata values from the list
     meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
-    # Find context: locate the start of the segment in the full text
     context = "Context not available (segment match failed)"
     try:
-        # Normalize whitespace to improve matching chances
         normalized_full_text = ' '.join(full_text.split())
         normalized_segment = ' '.join(segment_text.split())
         start_index = normalized_full_text.index(normalized_segment)
-        # Take 100 characters before the segment for context
         context = normalized_full_text[max(0, start_index - 100): start_index]
         context = '...' + context.strip()
     except ValueError:
-        pass # Keep default error message if segment not found
-    # Create the new row
     new_row = {
         "File ID": file_id,
         "Code": selected_code,
         "Coded Segment": segment_text,
         "Context (100 chars)": context,
-        **meta_dict # Add all metadata fields to the row
     }
-    # Append the new row to the DataFrame
-    # Note: Use pd.concat for reliable appending in modern pandas
     new_df = pd.concat([coded_data_df, pd.Series(new_row).to_frame().T], ignore_index=True)
     return new_df, "✅ Code applied successfully!"
 def generate_excel(coded_data_df):
-    """Generates and returns the path to the Excel file."""
     if coded_data_df.empty:
         return None, "⚠️ No codes have been applied yet."
     output_path = "qualitative_codes.xlsx"
-    # Ensure the 'openpyxl' engine is available for XLSX export
     try:
         coded_data_df.to_excel(output_path, index=False, engine='openpyxl')
         return output_path, "✅ Excel file generated and ready for download."
@@ -166,11 +138,8 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
     )
     # --- State Management (Hidden) ---
-    # Stores the currently loaded filename
     current_file_id = gr.State(value="")
-    # Stores the full text content of the transcript
     full_transcript_text = gr.State(value="")
-    # Stores the running list of codes (DataFrame)
     coded_data_state = gr.State(value=get_initial_df())
     # *** FIX IMPLEMENTATION: Put all visible UI inside a single Column ***
@@ -185,7 +154,6 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
             )
             status_message = gr.Textbox(label="Status", value="Ready", scale=2)
-        # File upload change event
         file_input.change(
             fn=process_file,
             inputs=file_input,
@@ -195,7 +163,6 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
         gr.Markdown("---")
         gr.Markdown("## 📝 Interview Metadata")
-        # Create textboxes for each metadata field
         metadata_inputs = []
         with gr.Row():
             for key, label in METADATA_FIELDS.items():
@@ -206,7 +173,6 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
         # --- B. TRANSCRIPT VIEW ---
         gr.Markdown("## 📖 Transcript")
-        # Display the full text (non-interactive so users copy from it)
         transcript_display = gr.Textbox(
             label="Transcript Content (Read-only - Copy segments from here)",
             lines=15,
@@ -214,7 +180,6 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
             value="",
             elem_id="transcript-display"
         )
-        # Update the transcript display whenever the hidden state changes
         full_transcript_text.change(lambda x: x, inputs=full_transcript_text, outputs=transcript_display)
         gr.Markdown("---")
@@ -243,16 +208,14 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
             label="Current Coded Segments",
             interactive=False,
         )
-        # Update the dataframe display whenever the hidden state changes
         coded_data_state.change(lambda x: x, inputs=coded_data_state, outputs=coded_output_df)
         with gr.Row():
             download_btn = gr.Button("Download Codes as XLSX", variant="secondary")
-            download_file = gr.File(label="Download File", visible=False) # Keep hidden until ready
         # --- E. ACTION BINDINGS ---
-        # 1. Apply Code Button Logic
         code_btn.click(
             fn=apply_code,
             inputs=[
@@ -261,18 +224,16 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
                 full_transcript_text,
                 segment_input,
                 code_dropdown,
-                gr.List(metadata_inputs) # Pass all metadata inputs as a list
             ],
             outputs=[coded_data_state, status_message]
         )
-        # 2. Download Button Logic
         download_btn.click(
             fn=generate_excel,
             inputs=coded_data_state,
             outputs=[download_file, status_message]
         ).then(
-            # Make the file visible after generation
             lambda x: gr.update(visible=True) if x[0] else gr.update(visible=False),
             inputs=[download_file],
             outputs=[download_file]
@@ -281,5 +242,9 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
 # Launch the app
 if __name__ == "__main__":
-    # Ensure you have installed: pip install gradio pandas openpyxl python-docx
-    demo.launch()

 import os
 import io
 try:
     import docx
 except ImportError:
     docx = None
 # --- 1. CONFIGURATION ---
 DEFAULT_CODES = [
     "Theme: Communication Barrier",
     "Theme: Emotional Support",
     "Other: Follow-up Needed",
 ]
 METADATA_FIELDS = {
     "interview_id": "Interview ID (e.g., I-001)",
     "interview_date": "Date of Interview (YYYY-MM-DD)",
 # --- 2. FILE PROCESSING FUNCTIONS ---
 def read_docx(file_path):
     """Return the text of a .docx file, one paragraph per line.

     When the module-level ``docx`` name is falsy (it is set to ``None`` if
     the import failed), an error message string is returned instead of the
     document text.
     """
     if docx:
         document = docx.Document(file_path)
         # Paragraphs are joined with newlines in document order.
         return '\n'.join(paragraph.text for paragraph in document.paragraphs)
     return "Error: python-docx library is not installed. Cannot read .docx."
 def read_vtt(file_path):
     """Extract the caption text of a WebVTT file as a single string.

     Each line is stripped of surrounding whitespace and dropped when it is
     empty, starts with "WEBVTT", contains "-->" (a cue timing line), or
     consists only of digits (a numeric cue identifier). The remaining lines
     are joined with single spaces.

     Args:
         file_path: Path of the .vtt file to read.

     Returns:
         The concatenated caption text; an empty string if nothing remains.
     """
     kept_lines = []
     # 'utf-8-sig' also decodes plain UTF-8, but drops a leading byte-order
     # mark. With plain 'utf-8' the first line would be "\ufeffWEBVTT", fail
     # the startswith check below, and leak the header into the transcript.
     with open(file_path, 'r', encoding='utf-8-sig') as f:
         for raw_line in f:
             line = raw_line.strip()
             if not line or line.isdigit():
                 continue
             if line.startswith("WEBVTT") or '-->' in line:
                 continue
             kept_lines.append(line)
     return ' '.join(kept_lines)
 def get_initial_df():
     """Build an empty DataFrame for coded segments.

     The columns are the four fixed coding columns followed by every key of
     ``METADATA_FIELDS``, in that order.
     """
     column_names = ["File ID", "Code", "Coded Segment", "Context (100 chars)"]
     column_names.extend(METADATA_FIELDS.keys())
     return pd.DataFrame(columns=column_names)
 def process_file(file_obj):
     if file_obj is None:
         return "", "No file uploaded.", "", get_initial_df()
     file_path = file_obj.name
     elif filename.lower().endswith('.vtt'):
         text_content = read_vtt(file_path)
     else:
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
                 text_content = f.read()
         except Exception as e:
             return "", f"Error reading file: {e}", "", get_initial_df()
     initial_coded_df = get_initial_df()
     return text_content, f"✅ Loaded: {filename}", filename, initial_coded_df
 # --- 3. CODING/DATA MANAGEMENT FUNCTIONS ---
     selected_code,
     metadata_values
 ):
     if not segment_text or not selected_code:
         return coded_data_df, "⚠️ Please select a text segment and a code."
     if not file_id:
          return coded_data_df, "⚠️ Please upload a file first."
     meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
     context = "Context not available (segment match failed)"
     try:
         normalized_full_text = ' '.join(full_text.split())
         normalized_segment = ' '.join(segment_text.split())
         start_index = normalized_full_text.index(normalized_segment)
         context = normalized_full_text[max(0, start_index - 100): start_index]
         context = '...' + context.strip()
     except ValueError:
+        pass
     new_row = {
         "File ID": file_id,
         "Code": selected_code,
         "Coded Segment": segment_text,
         "Context (100 chars)": context,
+        **meta_dict
     }
     new_df = pd.concat([coded_data_df, pd.Series(new_row).to_frame().T], ignore_index=True)
     return new_df, "✅ Code applied successfully!"
 def generate_excel(coded_data_df):
     if coded_data_df.empty:
         return None, "⚠️ No codes have been applied yet."
     output_path = "qualitative_codes.xlsx"
     try:
         coded_data_df.to_excel(output_path, index=False, engine='openpyxl')
         return output_path, "✅ Excel file generated and ready for download."
     )
     # --- State Management (Hidden) ---
     current_file_id = gr.State(value="")
     full_transcript_text = gr.State(value="")
     coded_data_state = gr.State(value=get_initial_df())
     # *** FIX IMPLEMENTATION: Put all visible UI inside a single Column ***
             )
             status_message = gr.Textbox(label="Status", value="Ready", scale=2)
         file_input.change(
             fn=process_file,
             inputs=file_input,
         gr.Markdown("---")
         gr.Markdown("## 📝 Interview Metadata")
         metadata_inputs = []
         with gr.Row():
             for key, label in METADATA_FIELDS.items():
         # --- B. TRANSCRIPT VIEW ---
         gr.Markdown("## 📖 Transcript")
         transcript_display = gr.Textbox(
             label="Transcript Content (Read-only - Copy segments from here)",
             lines=15,
             value="",
             elem_id="transcript-display"
         )
         full_transcript_text.change(lambda x: x, inputs=full_transcript_text, outputs=transcript_display)
         gr.Markdown("---")
             label="Current Coded Segments",
             interactive=False,
         )
         coded_data_state.change(lambda x: x, inputs=coded_data_state, outputs=coded_output_df)
         with gr.Row():
             download_btn = gr.Button("Download Codes as XLSX", variant="secondary")
+            download_file = gr.File(label="Download File", visible=False)
         # --- E. ACTION BINDINGS ---
         code_btn.click(
             fn=apply_code,
             inputs=[
                 full_transcript_text,
                 segment_input,
                 code_dropdown,
+                gr.List(metadata_inputs)
             ],
             outputs=[coded_data_state, status_message]
         )
         download_btn.click(
             fn=generate_excel,
             inputs=coded_data_state,
             outputs=[download_file, status_message]
         ).then(
             lambda x: gr.update(visible=True) if x[0] else gr.update(visible=False),
             inputs=[download_file],
             outputs=[download_file]
 # Launch the app
 if __name__ == "__main__":
+    # Use explicit server settings to ensure wide compatibility
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False
+    )