Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,13 +2,13 @@ import gradio as gr
|
|
| 2 |
import pandas as pd
|
| 3 |
import os
|
| 4 |
import io
|
|
|
|
| 5 |
# You will need to install python-docx for .docx file support
|
| 6 |
try:
|
| 7 |
import docx
|
| 8 |
except ImportError:
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
docx = None
|
| 12 |
|
| 13 |
# --- 1. CONFIGURATION ---
|
| 14 |
|
|
@@ -21,7 +21,7 @@ DEFAULT_CODES = [
|
|
| 21 |
"Other: Follow-up Needed",
|
| 22 |
]
|
| 23 |
|
| 24 |
-
# Define the metadata fields you want to collect
|
| 25 |
METADATA_FIELDS = {
|
| 26 |
"interview_id": "Interview ID (e.g., I-001)",
|
| 27 |
"interview_date": "Date of Interview (YYYY-MM-DD)",
|
|
@@ -40,6 +40,7 @@ def read_docx(file_path):
|
|
| 40 |
doc = docx.Document(file_path)
|
| 41 |
full_text = []
|
| 42 |
for para in doc.paragraphs:
|
|
|
|
| 43 |
full_text.append(para.text)
|
| 44 |
return '\n'.join(full_text)
|
| 45 |
|
|
@@ -53,12 +54,18 @@ def read_vtt(file_path):
|
|
| 53 |
transcript_lines = []
|
| 54 |
for line in lines:
|
| 55 |
if line and not line.startswith("WEBVTT") and '-->' not in line and not line.isdigit():
|
|
|
|
| 56 |
transcript_lines.append(line)
|
| 57 |
|
| 58 |
return ' '.join(transcript_lines)
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
def process_file(file_obj):
|
| 61 |
-
"""Handles file upload and returns the plain text content."""
|
| 62 |
if file_obj is None:
|
| 63 |
return "", "No file uploaded.", ""
|
| 64 |
|
|
@@ -77,9 +84,10 @@ def process_file(file_obj):
|
|
| 77 |
except Exception as e:
|
| 78 |
return "", f"Error reading file: {e}", ""
|
| 79 |
|
| 80 |
-
#
|
| 81 |
-
initial_coded_df =
|
| 82 |
|
|
|
|
| 83 |
return text_content, f"✅ Loaded: {filename}", filename, initial_coded_df
|
| 84 |
|
| 85 |
# --- 3. CODING/DATA MANAGEMENT FUNCTIONS ---
|
|
@@ -97,18 +105,27 @@ def apply_code(
|
|
| 97 |
# Check if a segment and code were provided
|
| 98 |
if not segment_text or not selected_code:
|
| 99 |
return coded_data_df, "⚠️ Please select a text segment and a code."
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
# Extract the metadata values from the list
|
|
|
|
| 102 |
meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
|
| 103 |
|
| 104 |
# Find context: locate the start of the segment in the full text
|
|
|
|
| 105 |
try:
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
# Take 100 characters before the segment for context
|
| 108 |
-
context =
|
| 109 |
-
context = '...' + context.
|
| 110 |
except ValueError:
|
| 111 |
-
|
| 112 |
|
| 113 |
# Create the new row
|
| 114 |
new_row = {
|
|
@@ -120,6 +137,7 @@ def apply_code(
|
|
| 120 |
}
|
| 121 |
|
| 122 |
# Append the new row to the DataFrame
|
|
|
|
| 123 |
new_df = pd.concat([coded_data_df, pd.Series(new_row).to_frame().T], ignore_index=True)
|
| 124 |
|
| 125 |
return new_df, "✅ Code applied successfully!"
|
|
@@ -131,9 +149,11 @@ def generate_excel(coded_data_df):
|
|
| 131 |
|
| 132 |
output_path = "qualitative_codes.xlsx"
|
| 133 |
# Ensure the 'openpyxl' engine is available for XLSX export
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
| 137 |
|
| 138 |
|
| 139 |
# --- 4. GRADIO INTERFACE ---
|
|
@@ -150,10 +170,8 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
|
|
| 150 |
current_file_id = gr.State(value="")
|
| 151 |
# Stores the full text content of the transcript
|
| 152 |
full_transcript_text = gr.State(value="")
|
| 153 |
-
# Stores the running list of codes
|
| 154 |
-
coded_data_state = gr.State(
|
| 155 |
-
value=pd.DataFrame(columns=["File ID", "Code", "Coded Segment", "Context (100 chars)"] + list(METADATA_FIELDS.keys()))
|
| 156 |
-
)
|
| 157 |
|
| 158 |
# --- A. FILE UPLOAD & METADATA ---
|
| 159 |
with gr.Row():
|
|
@@ -164,6 +182,8 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
|
|
| 164 |
)
|
| 165 |
status_message = gr.Textbox(label="Status", value="Ready", scale=2)
|
| 166 |
|
|
|
|
|
|
|
| 167 |
gr.Interface(
|
| 168 |
fn=process_file,
|
| 169 |
inputs=file_input,
|
|
@@ -171,7 +191,6 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
|
|
| 171 |
api_name=False,
|
| 172 |
live=False,
|
| 173 |
# Hide the default UI generated by Interface (we handle it below)
|
| 174 |
-
allow_flagging="never",
|
| 175 |
).clear()
|
| 176 |
|
| 177 |
gr.Markdown("---")
|
|
@@ -187,14 +206,16 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
|
|
| 187 |
|
| 188 |
# --- B. TRANSCRIPT VIEW ---
|
| 189 |
gr.Markdown("## 📖 Transcript")
|
|
|
|
| 190 |
# Display the full text (non-interactive so users copy from it)
|
| 191 |
transcript_display = gr.Textbox(
|
| 192 |
label="Transcript Content (Read-only - Copy segments from here)",
|
| 193 |
lines=15,
|
| 194 |
interactive=False,
|
| 195 |
value="",
|
|
|
|
| 196 |
)
|
| 197 |
-
#
|
| 198 |
full_transcript_text.change(lambda x: x, inputs=full_transcript_text, outputs=transcript_display)
|
| 199 |
|
| 200 |
gr.Markdown("---")
|
|
@@ -224,12 +245,12 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
|
|
| 224 |
interactive=False,
|
| 225 |
height=300
|
| 226 |
)
|
| 227 |
-
#
|
| 228 |
coded_data_state.change(lambda x: x, inputs=coded_data_state, outputs=coded_output_df)
|
| 229 |
|
| 230 |
with gr.Row():
|
| 231 |
download_btn = gr.Button("Download Codes as XLSX", variant="secondary")
|
| 232 |
-
download_file = gr.File(label="Download File")
|
| 233 |
|
| 234 |
# --- E. ACTION BINDINGS ---
|
| 235 |
|
|
@@ -252,10 +273,15 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
|
|
| 252 |
fn=generate_excel,
|
| 253 |
inputs=coded_data_state,
|
| 254 |
outputs=[download_file, status_message]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
)
|
| 256 |
|
|
|
|
| 257 |
# Launch the app
|
| 258 |
if __name__ == "__main__":
|
| 259 |
-
#
|
| 260 |
-
# pip install gradio pandas openpyxl python-docx
|
| 261 |
demo.launch()
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import os
|
| 4 |
import io
|
| 5 |
+
|
| 6 |
# You will need to install python-docx for .docx file support
|
| 7 |
try:
|
| 8 |
import docx
|
| 9 |
except ImportError:
|
| 10 |
+
# Set docx to None so the application can still run, but DOCX will be disabled.
|
| 11 |
+
docx = None
|
|
|
|
| 12 |
|
| 13 |
# --- 1. CONFIGURATION ---
|
| 14 |
|
|
|
|
| 21 |
"Other: Follow-up Needed",
|
| 22 |
]
|
| 23 |
|
| 24 |
+
# Define the metadata fields you want to collect (key: variable name, value: UI label)
|
| 25 |
METADATA_FIELDS = {
|
| 26 |
"interview_id": "Interview ID (e.g., I-001)",
|
| 27 |
"interview_date": "Date of Interview (YYYY-MM-DD)",
|
|
|
|
| 40 |
doc = docx.Document(file_path)
|
| 41 |
full_text = []
|
| 42 |
for para in doc.paragraphs:
|
| 43 |
+
# Collect each paragraph's text; the paragraphs are joined with newlines after the loop
|
| 44 |
full_text.append(para.text)
|
| 45 |
return '\n'.join(full_text)
|
| 46 |
|
|
|
|
| 54 |
transcript_lines = []
|
| 55 |
for line in lines:
|
| 56 |
if line and not line.startswith("WEBVTT") and '-->' not in line and not line.isdigit():
|
| 57 |
+
# Keep this caption text line; the kept lines are joined with single spaces after the loop
|
| 58 |
transcript_lines.append(line)
|
| 59 |
|
| 60 |
return ' '.join(transcript_lines)
|
| 61 |
|
| 62 |
+
def get_initial_df():
|
| 63 |
+
"""Return an empty DataFrame whose columns are the four core coding columns followed by one column per METADATA_FIELDS key."""
|
| 64 |
+
core_columns = ["File ID", "Code", "Coded Segment", "Context (100 chars)"]
|
| 65 |
+
return pd.DataFrame(columns=core_columns + list(METADATA_FIELDS.keys()))
|
| 66 |
+
|
| 67 |
def process_file(file_obj):
|
| 68 |
+
"""Handles file upload and returns the plain text content and resets state."""
|
| 69 |
if file_obj is None:
|
| 70 |
return "", "No file uploaded.", ""
|
| 71 |
|
|
|
|
| 84 |
except Exception as e:
|
| 85 |
return "", f"Error reading file: {e}", ""
|
| 86 |
|
| 87 |
+
# Reset the coded data state when a new file is loaded
|
| 88 |
+
initial_coded_df = get_initial_df()
|
| 89 |
|
| 90 |
+
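# On success, return the full text, a status message, the filename (ID), and the reset DataFrame
# NOTE(review): the early-return paths above return only three values - confirm they match the bound outputs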
|
| 91 |
return text_content, f"✅ Loaded: {filename}", filename, initial_coded_df
|
| 92 |
|
| 93 |
# --- 3. CODING/DATA MANAGEMENT FUNCTIONS ---
|
|
|
|
| 105 |
# Check if a segment and code were provided
|
| 106 |
if not segment_text or not selected_code:
|
| 107 |
return coded_data_df, "⚠️ Please select a text segment and a code."
|
| 108 |
+
if not file_id:
|
| 109 |
+
return coded_data_df, "⚠️ Please upload a file first."
|
| 110 |
+
|
| 111 |
|
| 112 |
# Extract the metadata values from the list
|
| 113 |
+
# metadata_values is a list of strings corresponding to the keys in METADATA_FIELDS
|
| 114 |
meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
|
| 115 |
|
| 116 |
# Find context: locate the start of the segment in the full text
|
| 117 |
+
context = "Context not available (segment match failed)"
|
| 118 |
try:
|
| 119 |
+
# Normalize whitespace to improve matching chances
|
| 120 |
+
normalized_full_text = ' '.join(full_text.split())
|
| 121 |
+
normalized_segment = ' '.join(segment_text.split())
|
| 122 |
+
|
| 123 |
+
start_index = normalized_full_text.index(normalized_segment)
|
| 124 |
# Take 100 characters before the segment for context
|
| 125 |
+
context = normalized_full_text[max(0, start_index - 100): start_index]
|
| 126 |
+
context = '...' + context.strip()
|
| 127 |
except ValueError:
|
| 128 |
+
pass # Keep default error message if segment not found
|
| 129 |
|
| 130 |
# Create the new row
|
| 131 |
new_row = {
|
|
|
|
| 137 |
}
|
| 138 |
|
| 139 |
# Append the new row to the DataFrame
|
| 140 |
+
# Note: Use pd.concat for reliable appending in modern pandas
|
| 141 |
new_df = pd.concat([coded_data_df, pd.Series(new_row).to_frame().T], ignore_index=True)
|
| 142 |
|
| 143 |
return new_df, "✅ Code applied successfully!"
|
|
|
|
| 149 |
|
| 150 |
output_path = "qualitative_codes.xlsx"
|
| 151 |
# Ensure the 'openpyxl' engine is available for XLSX export
|
| 152 |
+
try:
|
| 153 |
+
coded_data_df.to_excel(output_path, index=False, engine='openpyxl')
|
| 154 |
+
return output_path, "✅ Excel file generated and ready for download."
|
| 155 |
+
except Exception as e:
|
| 156 |
+
return None, f"❌ Error generating Excel file: {e}"
|
| 157 |
|
| 158 |
|
| 159 |
# --- 4. GRADIO INTERFACE ---
|
|
|
|
| 170 |
current_file_id = gr.State(value="")
|
| 171 |
# Stores the full text content of the transcript
|
| 172 |
full_transcript_text = gr.State(value="")
|
| 173 |
+
# Stores the running list of codes (DataFrame)
|
| 174 |
+
coded_data_state = gr.State(value=get_initial_df())
|
|
|
|
|
|
|
| 175 |
|
| 176 |
# --- A. FILE UPLOAD & METADATA ---
|
| 177 |
with gr.Row():
|
|
|
|
| 182 |
)
|
| 183 |
status_message = gr.Textbox(label="Status", value="Ready", scale=2)
|
| 184 |
|
| 185 |
+
# Use gr.Interface briefly just to handle the file upload and state update cleanly
|
| 186 |
+
# NOTE: The 'allow_flagging' argument has been removed to fix the TypeError.
|
| 187 |
gr.Interface(
|
| 188 |
fn=process_file,
|
| 189 |
inputs=file_input,
|
|
|
|
| 191 |
api_name=False,
|
| 192 |
live=False,
|
| 193 |
# Hide the default UI generated by Interface (we handle it below)
|
|
|
|
| 194 |
).clear()
|
| 195 |
|
| 196 |
gr.Markdown("---")
|
|
|
|
| 206 |
|
| 207 |
# --- B. TRANSCRIPT VIEW ---
|
| 208 |
gr.Markdown("## 📖 Transcript")
|
| 209 |
+
|
| 210 |
# Display the full text (non-interactive so users copy from it)
|
| 211 |
transcript_display = gr.Textbox(
|
| 212 |
label="Transcript Content (Read-only - Copy segments from here)",
|
| 213 |
lines=15,
|
| 214 |
interactive=False,
|
| 215 |
value="",
|
| 216 |
+
elem_id="transcript-display"
|
| 217 |
)
|
| 218 |
+
# Update the transcript display whenever the hidden state changes
|
| 219 |
full_transcript_text.change(lambda x: x, inputs=full_transcript_text, outputs=transcript_display)
|
| 220 |
|
| 221 |
gr.Markdown("---")
|
|
|
|
| 245 |
interactive=False,
|
| 246 |
height=300
|
| 247 |
)
|
| 248 |
+
# Update the dataframe display whenever the hidden state changes
|
| 249 |
coded_data_state.change(lambda x: x, inputs=coded_data_state, outputs=coded_output_df)
|
| 250 |
|
| 251 |
with gr.Row():
|
| 252 |
download_btn = gr.Button("Download Codes as XLSX", variant="secondary")
|
| 253 |
+
download_file = gr.File(label="Download File", visible=False) # Keep hidden until ready
|
| 254 |
|
| 255 |
# --- E. ACTION BINDINGS ---
|
| 256 |
|
|
|
|
| 273 |
fn=generate_excel,
|
| 274 |
inputs=coded_data_state,
|
| 275 |
outputs=[download_file, status_message]
|
| 276 |
+
).then(
|
| 277 |
+
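# Show the download component only when a file was produced; otherwise keep it hidden
# NOTE(review): `x[0]` raises TypeError if the component value is None - confirm what is passed on failure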
|
| 278 |
+
lambda x: gr.update(visible=True) if x[0] else gr.update(visible=False),
|
| 279 |
+
inputs=[download_file],
|
| 280 |
+
outputs=[download_file]
|
| 281 |
)
|
| 282 |
|
| 283 |
+
|
| 284 |
# Launch the app
|
| 285 |
if __name__ == "__main__":
|
| 286 |
+
# Ensure you have installed: pip install gradio pandas openpyxl python-docx
|
|
|
|
| 287 |
demo.launch()
|