clementBE committed on
Commit
211688e
·
verified ·
1 Parent(s): 96b2741

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -45
app.py CHANGED
@@ -3,16 +3,13 @@ import pandas as pd
3
  import os
4
  import io
5
 
6
- # You will need to install python-docx for .docx file support
7
  try:
8
  import docx
9
  except ImportError:
10
- # Set docx to None so the application can still run, but DOCX will be disabled.
11
  docx = None
12
 
13
  # --- 1. CONFIGURATION ---
14
 
15
- # Define the default codes for qualitative analysis
16
  DEFAULT_CODES = [
17
  "Theme: Communication Barrier",
18
  "Theme: Emotional Support",
@@ -21,7 +18,6 @@ DEFAULT_CODES = [
21
  "Other: Follow-up Needed",
22
  ]
23
 
24
- # Define the metadata fields you want to collect (key: variable name, value: UI label)
25
  METADATA_FIELDS = {
26
  "interview_id": "Interview ID (e.g., I-001)",
27
  "interview_date": "Date of Interview (YYYY-MM-DD)",
@@ -33,41 +29,33 @@ METADATA_FIELDS = {
33
  # --- 2. FILE PROCESSING FUNCTIONS ---
34
 
35
  def read_docx(file_path):
36
- """Extracts plain text from a .docx file."""
37
  if not docx:
38
  return "Error: python-docx library is not installed. Cannot read .docx."
39
 
40
  doc = docx.Document(file_path)
41
  full_text = []
42
  for para in doc.paragraphs:
43
- # Join paragraphs with a newline for better readability
44
  full_text.append(para.text)
45
  return '\n'.join(full_text)
46
 
47
  def read_vtt(file_path):
48
- """Extracts text from a .vtt file (simply ignoring time codes/metadata)."""
49
  with open(file_path, 'r', encoding='utf-8') as f:
50
  content = f.read()
51
 
52
- # Simple heuristic to strip VTT specific lines (WEBVTT, time stamps, blank lines)
53
  lines = [line.strip() for line in content.split('\n')]
54
  transcript_lines = []
55
  for line in lines:
56
  if line and not line.startswith("WEBVTT") and '-->' not in line and not line.isdigit():
57
- # Add a space at the end of each line to combine them into a single transcript
58
  transcript_lines.append(line)
59
 
60
  return ' '.join(transcript_lines)
61
 
62
  def get_initial_df():
63
- """Returns an empty DataFrame with all necessary columns (metadata + core data)."""
64
  core_columns = ["File ID", "Code", "Coded Segment", "Context (100 chars)"]
65
  return pd.DataFrame(columns=core_columns + list(METADATA_FIELDS.keys()))
66
 
67
  def process_file(file_obj):
68
- """Handles file upload and returns the plain text content and resets state."""
69
  if file_obj is None:
70
- # Return default values to clear the interface
71
  return "", "No file uploaded.", "", get_initial_df()
72
 
73
  file_path = file_obj.name
@@ -78,17 +66,14 @@ def process_file(file_obj):
78
  elif filename.lower().endswith('.vtt'):
79
  text_content = read_vtt(file_path)
80
  else:
81
- # For simple text files (or as a fallback)
82
  try:
83
  with open(file_path, 'r', encoding='utf-8') as f:
84
  text_content = f.read()
85
  except Exception as e:
86
  return "", f"Error reading file: {e}", "", get_initial_df()
87
 
88
- # Reset the coded data state when a new file is loaded
89
  initial_coded_df = get_initial_df()
90
 
91
- # The function returns the full text, a status message, the filename (ID), and the reset DataFrame
92
  return text_content, f"✅ Loaded: {filename}", filename, initial_coded_df
93
 
94
  # --- 3. CODING/DATA MANAGEMENT FUNCTIONS ---
@@ -101,54 +86,41 @@ def apply_code(
101
  selected_code,
102
  metadata_values
103
  ):
104
- """Adds a new coded segment and metadata to the DataFrame."""
105
-
106
- # Check if a segment and code were provided
107
  if not segment_text or not selected_code:
108
  return coded_data_df, "⚠️ Please select a text segment and a code."
109
  if not file_id:
110
  return coded_data_df, "⚠️ Please upload a file first."
111
 
112
-
113
- # Extract the metadata values from the list
114
  meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
115
 
116
- # Find context: locate the start of the segment in the full text
117
  context = "Context not available (segment match failed)"
118
  try:
119
- # Normalize whitespace to improve matching chances
120
  normalized_full_text = ' '.join(full_text.split())
121
  normalized_segment = ' '.join(segment_text.split())
122
 
123
  start_index = normalized_full_text.index(normalized_segment)
124
- # Take 100 characters before the segment for context
125
  context = normalized_full_text[max(0, start_index - 100): start_index]
126
  context = '...' + context.strip()
127
  except ValueError:
128
- pass # Keep default error message if segment not found
129
 
130
- # Create the new row
131
  new_row = {
132
  "File ID": file_id,
133
  "Code": selected_code,
134
  "Coded Segment": segment_text,
135
  "Context (100 chars)": context,
136
- **meta_dict # Add all metadata fields to the row
137
  }
138
 
139
- # Append the new row to the DataFrame
140
- # Note: Use pd.concat for reliable appending in modern pandas
141
  new_df = pd.concat([coded_data_df, pd.Series(new_row).to_frame().T], ignore_index=True)
142
 
143
  return new_df, "✅ Code applied successfully!"
144
 
145
  def generate_excel(coded_data_df):
146
- """Generates and returns the path to the Excel file."""
147
  if coded_data_df.empty:
148
  return None, "⚠️ No codes have been applied yet."
149
 
150
  output_path = "qualitative_codes.xlsx"
151
- # Ensure the 'openpyxl' engine is available for XLSX export
152
  try:
153
  coded_data_df.to_excel(output_path, index=False, engine='openpyxl')
154
  return output_path, "✅ Excel file generated and ready for download."
@@ -166,11 +138,8 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
166
  )
167
 
168
  # --- State Management (Hidden) ---
169
- # Stores the currently loaded filename
170
  current_file_id = gr.State(value="")
171
- # Stores the full text content of the transcript
172
  full_transcript_text = gr.State(value="")
173
- # Stores the running list of codes (DataFrame)
174
  coded_data_state = gr.State(value=get_initial_df())
175
 
176
  # *** FIX IMPLEMENTATION: Put all visible UI inside a single Column ***
@@ -185,7 +154,6 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
185
  )
186
  status_message = gr.Textbox(label="Status", value="Ready", scale=2)
187
 
188
- # File upload change event
189
  file_input.change(
190
  fn=process_file,
191
  inputs=file_input,
@@ -195,7 +163,6 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
195
  gr.Markdown("---")
196
  gr.Markdown("## 📝 Interview Metadata")
197
 
198
- # Create textboxes for each metadata field
199
  metadata_inputs = []
200
  with gr.Row():
201
  for key, label in METADATA_FIELDS.items():
@@ -206,7 +173,6 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
206
  # --- B. TRANSCRIPT VIEW ---
207
  gr.Markdown("## 📖 Transcript")
208
 
209
- # Display the full text (non-interactive so users copy from it)
210
  transcript_display = gr.Textbox(
211
  label="Transcript Content (Read-only - Copy segments from here)",
212
  lines=15,
@@ -214,7 +180,6 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
214
  value="",
215
  elem_id="transcript-display"
216
  )
217
- # Update the transcript display whenever the hidden state changes
218
  full_transcript_text.change(lambda x: x, inputs=full_transcript_text, outputs=transcript_display)
219
 
220
  gr.Markdown("---")
@@ -243,16 +208,14 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
243
  label="Current Coded Segments",
244
  interactive=False,
245
  )
246
- # Update the dataframe display whenever the hidden state changes
247
  coded_data_state.change(lambda x: x, inputs=coded_data_state, outputs=coded_output_df)
248
 
249
  with gr.Row():
250
  download_btn = gr.Button("Download Codes as XLSX", variant="secondary")
251
- download_file = gr.File(label="Download File", visible=False) # Keep hidden until ready
252
 
253
  # --- E. ACTION BINDINGS ---
254
 
255
- # 1. Apply Code Button Logic
256
  code_btn.click(
257
  fn=apply_code,
258
  inputs=[
@@ -261,18 +224,16 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
261
  full_transcript_text,
262
  segment_input,
263
  code_dropdown,
264
- gr.List(metadata_inputs) # Pass all metadata inputs as a list
265
  ],
266
  outputs=[coded_data_state, status_message]
267
  )
268
 
269
- # 2. Download Button Logic
270
  download_btn.click(
271
  fn=generate_excel,
272
  inputs=coded_data_state,
273
  outputs=[download_file, status_message]
274
  ).then(
275
- # Make the file visible after generation
276
  lambda x: gr.update(visible=True) if x[0] else gr.update(visible=False),
277
  inputs=[download_file],
278
  outputs=[download_file]
@@ -281,5 +242,9 @@ with gr.Blocks(title="Qualitative Coding Interface") as demo:
281
 
282
  # Launch the app
283
  if __name__ == "__main__":
284
- # Ensure you have installed: pip install gradio pandas openpyxl python-docx
285
- demo.launch()
 
 
 
 
 
3
  import os
4
  import io
5
 
 
6
  try:
7
  import docx
8
  except ImportError:
 
9
  docx = None
10
 
11
  # --- 1. CONFIGURATION ---
12
 
 
13
  DEFAULT_CODES = [
14
  "Theme: Communication Barrier",
15
  "Theme: Emotional Support",
 
18
  "Other: Follow-up Needed",
19
  ]
20
 
 
21
  METADATA_FIELDS = {
22
  "interview_id": "Interview ID (e.g., I-001)",
23
  "interview_date": "Date of Interview (YYYY-MM-DD)",
 
29
  # --- 2. FILE PROCESSING FUNCTIONS ---
30
 
31
  def read_docx(file_path):
 
32
  if not docx:
33
  return "Error: python-docx library is not installed. Cannot read .docx."
34
 
35
  doc = docx.Document(file_path)
36
  full_text = []
37
  for para in doc.paragraphs:
 
38
  full_text.append(para.text)
39
  return '\n'.join(full_text)
40
 
41
  def read_vtt(file_path):
 
42
  with open(file_path, 'r', encoding='utf-8') as f:
43
  content = f.read()
44
 
 
45
  lines = [line.strip() for line in content.split('\n')]
46
  transcript_lines = []
47
  for line in lines:
48
  if line and not line.startswith("WEBVTT") and '-->' not in line and not line.isdigit():
 
49
  transcript_lines.append(line)
50
 
51
  return ' '.join(transcript_lines)
52
 
53
  def get_initial_df():
 
54
  core_columns = ["File ID", "Code", "Coded Segment", "Context (100 chars)"]
55
  return pd.DataFrame(columns=core_columns + list(METADATA_FIELDS.keys()))
56
 
57
  def process_file(file_obj):
 
58
  if file_obj is None:
 
59
  return "", "No file uploaded.", "", get_initial_df()
60
 
61
  file_path = file_obj.name
 
66
  elif filename.lower().endswith('.vtt'):
67
  text_content = read_vtt(file_path)
68
  else:
 
69
  try:
70
  with open(file_path, 'r', encoding='utf-8') as f:
71
  text_content = f.read()
72
  except Exception as e:
73
  return "", f"Error reading file: {e}", "", get_initial_df()
74
 
 
75
  initial_coded_df = get_initial_df()
76
 
 
77
  return text_content, f"✅ Loaded: {filename}", filename, initial_coded_df
78
 
79
  # --- 3. CODING/DATA MANAGEMENT FUNCTIONS ---
 
86
  selected_code,
87
  metadata_values
88
  ):
 
 
 
89
  if not segment_text or not selected_code:
90
  return coded_data_df, "⚠️ Please select a text segment and a code."
91
  if not file_id:
92
  return coded_data_df, "⚠️ Please upload a file first."
93
 
 
 
94
  meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
95
 
 
96
  context = "Context not available (segment match failed)"
97
  try:
 
98
  normalized_full_text = ' '.join(full_text.split())
99
  normalized_segment = ' '.join(segment_text.split())
100
 
101
  start_index = normalized_full_text.index(normalized_segment)
 
102
  context = normalized_full_text[max(0, start_index - 100): start_index]
103
  context = '...' + context.strip()
104
  except ValueError:
105
+ pass
106
 
 
107
  new_row = {
108
  "File ID": file_id,
109
  "Code": selected_code,
110
  "Coded Segment": segment_text,
111
  "Context (100 chars)": context,
112
+ **meta_dict
113
  }
114
 
 
 
115
  new_df = pd.concat([coded_data_df, pd.Series(new_row).to_frame().T], ignore_index=True)
116
 
117
  return new_df, "✅ Code applied successfully!"
118
 
119
  def generate_excel(coded_data_df):
 
120
  if coded_data_df.empty:
121
  return None, "⚠️ No codes have been applied yet."
122
 
123
  output_path = "qualitative_codes.xlsx"
 
124
  try:
125
  coded_data_df.to_excel(output_path, index=False, engine='openpyxl')
126
  return output_path, "✅ Excel file generated and ready for download."
 
138
  )
139
 
140
  # --- State Management (Hidden) ---
 
141
  current_file_id = gr.State(value="")
 
142
  full_transcript_text = gr.State(value="")
 
143
  coded_data_state = gr.State(value=get_initial_df())
144
 
145
  # *** FIX IMPLEMENTATION: Put all visible UI inside a single Column ***
 
154
  )
155
  status_message = gr.Textbox(label="Status", value="Ready", scale=2)
156
 
 
157
  file_input.change(
158
  fn=process_file,
159
  inputs=file_input,
 
163
  gr.Markdown("---")
164
  gr.Markdown("## 📝 Interview Metadata")
165
 
 
166
  metadata_inputs = []
167
  with gr.Row():
168
  for key, label in METADATA_FIELDS.items():
 
173
  # --- B. TRANSCRIPT VIEW ---
174
  gr.Markdown("## 📖 Transcript")
175
 
 
176
  transcript_display = gr.Textbox(
177
  label="Transcript Content (Read-only - Copy segments from here)",
178
  lines=15,
 
180
  value="",
181
  elem_id="transcript-display"
182
  )
 
183
  full_transcript_text.change(lambda x: x, inputs=full_transcript_text, outputs=transcript_display)
184
 
185
  gr.Markdown("---")
 
208
  label="Current Coded Segments",
209
  interactive=False,
210
  )
 
211
  coded_data_state.change(lambda x: x, inputs=coded_data_state, outputs=coded_output_df)
212
 
213
  with gr.Row():
214
  download_btn = gr.Button("Download Codes as XLSX", variant="secondary")
215
+ download_file = gr.File(label="Download File", visible=False)
216
 
217
  # --- E. ACTION BINDINGS ---
218
 
 
219
  code_btn.click(
220
  fn=apply_code,
221
  inputs=[
 
224
  full_transcript_text,
225
  segment_input,
226
  code_dropdown,
227
+ gr.List(metadata_inputs)
228
  ],
229
  outputs=[coded_data_state, status_message]
230
  )
231
 
 
232
  download_btn.click(
233
  fn=generate_excel,
234
  inputs=coded_data_state,
235
  outputs=[download_file, status_message]
236
  ).then(
 
237
  lambda x: gr.update(visible=True) if x[0] else gr.update(visible=False),
238
  inputs=[download_file],
239
  outputs=[download_file]
 
242
 
243
  # Launch the app
244
  if __name__ == "__main__":
245
+ # Use explicit server settings to ensure wide compatibility
246
+ demo.launch(
247
+ server_name="0.0.0.0",
248
+ server_port=7860,
249
+ share=False
250
+ )