clementBE committed on
Commit
9433a59
·
verified ·
1 Parent(s): addc2e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -24
app.py CHANGED
@@ -2,13 +2,13 @@ import gradio as gr
2
  import pandas as pd
3
  import os
4
  import io
 
5
  # You will need to install python-docx for .docx file support
6
  try:
7
  import docx
8
  except ImportError:
9
- print("Warning: 'python-docx' library not found. Install with: pip install python-docx")
10
- print("DOCX files will not be supported.")
11
- docx = None
12
 
13
  # --- 1. CONFIGURATION ---
14
 
@@ -21,7 +21,7 @@ DEFAULT_CODES = [
21
  "Other: Follow-up Needed",
22
  ]
23
 
24
- # Define the metadata fields you want to collect
25
  METADATA_FIELDS = {
26
  "interview_id": "Interview ID (e.g., I-001)",
27
  "interview_date": "Date of Interview (YYYY-MM-DD)",
@@ -40,6 +40,7 @@ def read_docx(file_path):
40
  doc = docx.Document(file_path)
41
  full_text = []
42
  for para in doc.paragraphs:
 
43
  full_text.append(para.text)
44
  return '\n'.join(full_text)
45
 
@@ -53,12 +54,18 @@ def read_vtt(file_path):
53
  transcript_lines = []
54
  for line in lines:
55
  if line and not line.startswith("WEBVTT") and '-->' not in line and not line.isdigit():
 
56
  transcript_lines.append(line)
57
 
58
  return ' '.join(transcript_lines)
59
 
 
 
 
 
 
60
  def process_file(file_obj):
61
- """Handles file upload and returns the plain text content."""
62
  if file_obj is None:
63
  return "", "No file uploaded.", ""
64
 
@@ -77,9 +84,10 @@ def process_file(file_obj):
77
  except Exception as e:
78
  return "", f"Error reading file: {e}", ""
79
 
80
- # Clear the coded data state when a new file is loaded
81
- initial_coded_df = pd.DataFrame(columns=["File ID", "Code", "Coded Segment", "Context (100 chars)"])
82
 
 
83
  return text_content, f"✅ Loaded: {filename}", filename, initial_coded_df
84
 
85
  # --- 3. CODING/DATA MANAGEMENT FUNCTIONS ---
@@ -97,18 +105,27 @@ def apply_code(
97
  # Check if a segment and code were provided
98
  if not segment_text or not selected_code:
99
  return coded_data_df, "⚠️ Please select a text segment and a code."
 
 
 
100
 
101
  # Extract the metadata values from the list
 
102
  meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
103
 
104
  # Find context: locate the start of the segment in the full text
 
105
  try:
106
- start_index = full_text.index(segment_text)
 
 
 
 
107
  # Take 100 characters before the segment for context
108
- context = full_text[max(0, start_index - 100): start_index]
109
- context = '...' + context.replace('\n', ' ')
110
  except ValueError:
111
- context = "Segment not found in transcript (may be due to formatting)."
112
 
113
  # Create the new row
114
  new_row = {
@@ -120,6 +137,7 @@ def apply_code(
120
  }
121
 
122
  # Append the new row to the DataFrame
 
123
  new_df = pd.concat([coded_data_df, pd.Series(new_row).to_frame().T], ignore_index=True)
124
 
125
  return new_df, "✅ Code applied successfully!"
@@ -131,9 +149,11 @@ def generate_excel(coded_data_df):
131
 
132
  output_path = "qualitative_codes.xlsx"
133
  # Ensure the 'openpyxl' engine is available for XLSX export
134
- coded_data_df.to_excel(output_path, index=False, engine='openpyxl')
135
-
136
- return output_path, "✅ Excel file generated and ready for download."
 
 
137
 
138
 
139
  # --- 4. GRADIO INTERFACE ---
@@ -150,10 +170,8 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
150
  current_file_id = gr.State(value="")
151
  # Stores the full text content of the transcript
152
  full_transcript_text = gr.State(value="")
153
- # Stores the running list of codes
154
- coded_data_state = gr.State(
155
- value=pd.DataFrame(columns=["File ID", "Code", "Coded Segment", "Context (100 chars)"] + list(METADATA_FIELDS.keys()))
156
- )
157
 
158
  # --- A. FILE UPLOAD & METADATA ---
159
  with gr.Row():
@@ -164,6 +182,8 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
164
  )
165
  status_message = gr.Textbox(label="Status", value="Ready", scale=2)
166
 
 
 
167
  gr.Interface(
168
  fn=process_file,
169
  inputs=file_input,
@@ -171,7 +191,6 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
171
  api_name=False,
172
  live=False,
173
  # Hide the default UI generated by Interface (we handle it below)
174
- allow_flagging="never",
175
  ).clear()
176
 
177
  gr.Markdown("---")
@@ -187,14 +206,16 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
187
 
188
  # --- B. TRANSCRIPT VIEW ---
189
  gr.Markdown("## 📖 Transcript")
 
190
  # Display the full text (non-interactive so users copy from it)
191
  transcript_display = gr.Textbox(
192
  label="Transcript Content (Read-only - Copy segments from here)",
193
  lines=15,
194
  interactive=False,
195
  value="",
 
196
  )
197
- # Connect the state to the display box
198
  full_transcript_text.change(lambda x: x, inputs=full_transcript_text, outputs=transcript_display)
199
 
200
  gr.Markdown("---")
@@ -224,12 +245,12 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
224
  interactive=False,
225
  height=300
226
  )
227
- # Initialize the dataframe display with the state
228
  coded_data_state.change(lambda x: x, inputs=coded_data_state, outputs=coded_output_df)
229
 
230
  with gr.Row():
231
  download_btn = gr.Button("Download Codes as XLSX", variant="secondary")
232
- download_file = gr.File(label="Download File")
233
 
234
  # --- E. ACTION BINDINGS ---
235
 
@@ -252,10 +273,15 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
252
  fn=generate_excel,
253
  inputs=coded_data_state,
254
  outputs=[download_file, status_message]
 
 
 
 
 
255
  )
256
 
 
257
  # Launch the app
258
  if __name__ == "__main__":
259
- # Note: If running this, you may need to install:
260
- # pip install gradio pandas openpyxl python-docx
261
  demo.launch()
 
2
  import pandas as pd
3
  import os
4
  import io
5
+
6
  # You will need to install python-docx for .docx file support
7
  try:
8
  import docx
9
  except ImportError:
10
+ # Set docx to None so the application can still run, but DOCX will be disabled.
11
+ docx = None
 
12
 
13
  # --- 1. CONFIGURATION ---
14
 
 
21
  "Other: Follow-up Needed",
22
  ]
23
 
24
+ # Define the metadata fields you want to collect (key: variable name, value: UI label)
25
  METADATA_FIELDS = {
26
  "interview_id": "Interview ID (e.g., I-001)",
27
  "interview_date": "Date of Interview (YYYY-MM-DD)",
 
40
  doc = docx.Document(file_path)
41
  full_text = []
42
  for para in doc.paragraphs:
43
+ # Collect each paragraph's text; the paragraphs are joined with newlines in the return below
44
  full_text.append(para.text)
45
  return '\n'.join(full_text)
46
 
 
54
  transcript_lines = []
55
  for line in lines:
56
  if line and not line.startswith("WEBVTT") and '-->' not in line and not line.isdigit():
57
+ # Keep the line unchanged (header, timing, and numeric-only lines are filtered out above); lines are joined with single spaces in the return below
58
  transcript_lines.append(line)
59
 
60
  return ' '.join(transcript_lines)
61
 
62
+ def get_initial_df():
63
+ """Returns an empty DataFrame with all necessary columns (metadata + core data)."""
64
+ core_columns = ["File ID", "Code", "Coded Segment", "Context (100 chars)"]
65
+ return pd.DataFrame(columns=core_columns + list(METADATA_FIELDS.keys()))
66
+
67
  def process_file(file_obj):
68
+ """Handles file upload and returns the plain text content and resets state."""
69
  if file_obj is None:
70
  return "", "No file uploaded.", ""
71
 
 
84
  except Exception as e:
85
  return "", f"Error reading file: {e}", ""
86
 
87
+ # Reset the coded data state when a new file is loaded
88
+ initial_coded_df = get_initial_df()
89
 
90
+ # On success the function returns four values: the full text, a status message, the filename (ID), and the reset DataFrame (the early-return paths above return only three)
91
  return text_content, f"✅ Loaded: {filename}", filename, initial_coded_df
92
 
93
  # --- 3. CODING/DATA MANAGEMENT FUNCTIONS ---
 
105
  # Check if a segment and code were provided
106
  if not segment_text or not selected_code:
107
  return coded_data_df, "⚠️ Please select a text segment and a code."
108
+ if not file_id:
109
+ return coded_data_df, "⚠️ Please upload a file first."
110
+
111
 
112
  # Extract the metadata values from the list
113
+ # metadata_values is a list of strings corresponding to the keys in METADATA_FIELDS
114
  meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
115
 
116
  # Find context: locate the start of the segment in the full text
117
+ context = "Context not available (segment match failed)"
118
  try:
119
+ # Normalize whitespace to improve matching chances
120
+ normalized_full_text = ' '.join(full_text.split())
121
+ normalized_segment = ' '.join(segment_text.split())
122
+
123
+ start_index = normalized_full_text.index(normalized_segment)
124
  # Take 100 characters before the segment for context
125
+ context = normalized_full_text[max(0, start_index - 100): start_index]
126
+ context = '...' + context.strip()
127
  except ValueError:
128
+ pass # Keep default error message if segment not found
129
 
130
  # Create the new row
131
  new_row = {
 
137
  }
138
 
139
  # Append the new row to the DataFrame
140
+ # Note: Use pd.concat for reliable appending in modern pandas
141
  new_df = pd.concat([coded_data_df, pd.Series(new_row).to_frame().T], ignore_index=True)
142
 
143
  return new_df, "✅ Code applied successfully!"
 
149
 
150
  output_path = "qualitative_codes.xlsx"
151
  # Ensure the 'openpyxl' engine is available for XLSX export
152
+ try:
153
+ coded_data_df.to_excel(output_path, index=False, engine='openpyxl')
154
+ return output_path, "✅ Excel file generated and ready for download."
155
+ except Exception as e:
156
+ return None, f"❌ Error generating Excel file: {e}"
157
 
158
 
159
  # --- 4. GRADIO INTERFACE ---
 
170
  current_file_id = gr.State(value="")
171
  # Stores the full text content of the transcript
172
  full_transcript_text = gr.State(value="")
173
+ # Stores the running list of codes (DataFrame)
174
+ coded_data_state = gr.State(value=get_initial_df())
 
 
175
 
176
  # --- A. FILE UPLOAD & METADATA ---
177
  with gr.Row():
 
182
  )
183
  status_message = gr.Textbox(label="Status", value="Ready", scale=2)
184
 
185
+ # Use gr.Interface briefly just to handle the file upload and state update cleanly
186
+ # NOTE: The 'allow_flagging' argument has been removed to fix the TypeError.
187
  gr.Interface(
188
  fn=process_file,
189
  inputs=file_input,
 
191
  api_name=False,
192
  live=False,
193
  # Hide the default UI generated by Interface (we handle it below)
 
194
  ).clear()
195
 
196
  gr.Markdown("---")
 
206
 
207
  # --- B. TRANSCRIPT VIEW ---
208
  gr.Markdown("## 📖 Transcript")
209
+
210
  # Display the full text (non-interactive so users copy from it)
211
  transcript_display = gr.Textbox(
212
  label="Transcript Content (Read-only - Copy segments from here)",
213
  lines=15,
214
  interactive=False,
215
  value="",
216
+ elem_id="transcript-display"
217
  )
218
+ # Update the transcript display whenever the hidden state changes
219
  full_transcript_text.change(lambda x: x, inputs=full_transcript_text, outputs=transcript_display)
220
 
221
  gr.Markdown("---")
 
245
  interactive=False,
246
  height=300
247
  )
248
+ # Update the dataframe display whenever the hidden state changes
249
  coded_data_state.change(lambda x: x, inputs=coded_data_state, outputs=coded_output_df)
250
 
251
  with gr.Row():
252
  download_btn = gr.Button("Download Codes as XLSX", variant="secondary")
253
+ download_file = gr.File(label="Download File", visible=False) # Keep hidden until ready
254
 
255
  # --- E. ACTION BINDINGS ---
256
 
 
273
  fn=generate_excel,
274
  inputs=coded_data_state,
275
  outputs=[download_file, status_message]
276
+ ).then(
277
+ # Make the file visible after generation
278
+ lambda x: gr.update(visible=True) if x[0] else gr.update(visible=False),
279
+ inputs=[download_file],
280
+ outputs=[download_file]
281
  )
282
 
283
+
284
  # Launch the app
285
  if __name__ == "__main__":
286
+ # Ensure you have installed: pip install gradio pandas openpyxl python-docx
 
287
  demo.launch()