clementBE committed on
Commit
2aa6081
·
verified ·
1 Parent(s): 211688e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +154 -189
app.py CHANGED
@@ -1,14 +1,15 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import os
4
- import io
5
 
6
  try:
7
  import docx
8
  except ImportError:
9
- docx = None
10
 
11
- # --- 1. CONFIGURATION ---
 
 
12
 
13
  DEFAULT_CODES = [
14
  "Theme: Communication Barrier",
@@ -26,33 +27,38 @@ METADATA_FIELDS = {
26
  }
27
 
28
 
29
- # --- 2. FILE PROCESSING FUNCTIONS ---
 
 
30
 
31
  def read_docx(file_path):
32
  if not docx:
33
- return "Error: python-docx library is not installed. Cannot read .docx."
34
-
35
- doc = docx.Document(file_path)
36
- full_text = []
37
- for para in doc.paragraphs:
38
- full_text.append(para.text)
39
- return '\n'.join(full_text)
40
 
41
  def read_vtt(file_path):
42
- with open(file_path, 'r', encoding='utf-8') as f:
43
  content = f.read()
44
-
45
- lines = [line.strip() for line in content.split('\n')]
46
- transcript_lines = []
47
- for line in lines:
48
- if line and not line.startswith("WEBVTT") and '-->' not in line and not line.isdigit():
49
- transcript_lines.append(line)
50
-
51
- return ' '.join(transcript_lines)
52
 
53
  def get_initial_df():
54
- core_columns = ["File ID", "Code", "Coded Segment", "Context (100 chars)"]
55
- return pd.DataFrame(columns=core_columns + list(METADATA_FIELDS.keys()))
 
 
 
 
 
56
 
57
  def process_file(file_obj):
58
  if file_obj is None:
@@ -60,191 +66,150 @@ def process_file(file_obj):
60
 
61
  file_path = file_obj.name
62
  filename = os.path.basename(file_path)
63
-
64
- if filename.lower().endswith('.docx'):
65
- text_content = read_docx(file_path)
66
- elif filename.lower().endswith('.vtt'):
67
- text_content = read_vtt(file_path)
68
  else:
69
- try:
70
- with open(file_path, 'r', encoding='utf-8') as f:
71
- text_content = f.read()
72
- except Exception as e:
73
- return "", f"Error reading file: {e}", "", get_initial_df()
74
-
75
- initial_coded_df = get_initial_df()
76
-
77
- return text_content, f"βœ… Loaded: {filename}", filename, initial_coded_df
78
-
79
- # --- 3. CODING/DATA MANAGEMENT FUNCTIONS ---
80
-
81
- def apply_code(
82
- coded_data_df,
83
- file_id,
84
- full_text,
85
- segment_text,
86
- selected_code,
87
- metadata_values
88
- ):
89
  if not segment_text or not selected_code:
90
- return coded_data_df, "⚠️ Please select a text segment and a code."
91
- if not file_id:
92
- return coded_data_df, "⚠️ Please upload a file first."
93
 
94
  meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))
95
-
96
- context = "Context not available (segment match failed)"
 
97
  try:
98
- normalized_full_text = ' '.join(full_text.split())
99
- normalized_segment = ' '.join(segment_text.split())
100
-
101
- start_index = normalized_full_text.index(normalized_segment)
102
- context = normalized_full_text[max(0, start_index - 100): start_index]
103
- context = '...' + context.strip()
104
- except ValueError:
105
- pass
106
-
107
  new_row = {
108
  "File ID": file_id,
109
  "Code": selected_code,
110
  "Coded Segment": segment_text,
111
  "Context (100 chars)": context,
112
- **meta_dict
113
  }
114
 
115
- new_df = pd.concat([coded_data_df, pd.Series(new_row).to_frame().T], ignore_index=True)
 
116
 
117
- return new_df, "βœ… Code applied successfully!"
118
 
119
- def generate_excel(coded_data_df):
120
- if coded_data_df.empty:
121
- return None, "⚠️ No codes have been applied yet."
122
-
123
- output_path = "qualitative_codes.xlsx"
124
- try:
125
- coded_data_df.to_excel(output_path, index=False, engine='openpyxl')
126
- return output_path, "βœ… Excel file generated and ready for download."
127
- except Exception as e:
128
- return None, f"❌ Error generating Excel file: {e}"
129
 
 
 
 
130
 
131
- # --- 4. GRADIO INTERFACE ---
 
 
 
 
 
 
 
132
 
133
  with gr.Blocks(title="Qualitative Coding Interface") as demo:
 
134
  gr.Markdown("# πŸ“‘ Qualitative Coding Interface")
135
- gr.Markdown(
136
- "Upload a `.docx`, `.vtt`, or `.txt` transcript, add interview metadata, and then "
137
- "copy text segments from the transcript box to the 'Segment to Code' box below to apply tags."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  )
139
-
140
- # --- State Management (Hidden) ---
141
- current_file_id = gr.State(value="")
142
- full_transcript_text = gr.State(value="")
143
- coded_data_state = gr.State(value=get_initial_df())
144
-
145
- # *** FIX IMPLEMENTATION: Put all visible UI inside a single Column ***
146
- with gr.Column():
147
-
148
- # --- A. FILE UPLOAD & METADATA ---
149
- with gr.Row():
150
- file_input = gr.File(
151
- label="Upload Transcript (.docx, .vtt, .txt)",
152
- file_types=[".docx", ".vtt", ".txt"],
153
- scale=1
154
- )
155
- status_message = gr.Textbox(label="Status", value="Ready", scale=2)
156
-
157
- file_input.change(
158
- fn=process_file,
159
- inputs=file_input,
160
- outputs=[full_transcript_text, status_message, current_file_id, coded_data_state]
161
- )
162
-
163
- gr.Markdown("---")
164
- gr.Markdown("## πŸ“ Interview Metadata")
165
-
166
- metadata_inputs = []
167
- with gr.Row():
168
- for key, label in METADATA_FIELDS.items():
169
- metadata_inputs.append(gr.Textbox(label=label, value="", max_lines=1, interactive=True))
170
-
171
- gr.Markdown("---")
172
-
173
- # --- B. TRANSCRIPT VIEW ---
174
- gr.Markdown("## πŸ“– Transcript")
175
-
176
- transcript_display = gr.Textbox(
177
- label="Transcript Content (Read-only - Copy segments from here)",
178
- lines=15,
179
- interactive=False,
180
- value="",
181
- elem_id="transcript-display"
182
- )
183
- full_transcript_text.change(lambda x: x, inputs=full_transcript_text, outputs=transcript_display)
184
-
185
- gr.Markdown("---")
186
-
187
- # --- C. CODING/TAGGING CONTROLS ---
188
- gr.Markdown("## 🏷️ Apply Code")
189
- with gr.Row():
190
- segment_input = gr.Textbox(
191
- label="Segment to Code (Paste the text you copied from above)",
192
- lines=3,
193
- scale=3
194
- )
195
- code_dropdown = gr.Dropdown(
196
- label="Select Code/Tag",
197
- choices=DEFAULT_CODES,
198
- scale=1
199
- )
200
-
201
- code_btn = gr.Button("Apply Code & Save Segment", variant="primary")
202
-
203
- # --- D. CODED DATA & DOWNLOAD ---
204
- gr.Markdown("---")
205
- gr.Markdown("## πŸ“Š Coded Data")
206
-
207
- coded_output_df = gr.Dataframe(
208
- label="Current Coded Segments",
209
- interactive=False,
210
- )
211
- coded_data_state.change(lambda x: x, inputs=coded_data_state, outputs=coded_output_df)
212
-
213
- with gr.Row():
214
- download_btn = gr.Button("Download Codes as XLSX", variant="secondary")
215
- download_file = gr.File(label="Download File", visible=False)
216
-
217
- # --- E. ACTION BINDINGS ---
218
-
219
- code_btn.click(
220
- fn=apply_code,
221
- inputs=[
222
- coded_data_state,
223
- current_file_id,
224
- full_transcript_text,
225
- segment_input,
226
- code_dropdown,
227
- gr.List(metadata_inputs)
228
- ],
229
- outputs=[coded_data_state, status_message]
230
- )
231
-
232
- download_btn.click(
233
- fn=generate_excel,
234
- inputs=coded_data_state,
235
- outputs=[download_file, status_message]
236
- ).then(
237
- lambda x: gr.update(visible=True) if x[0] else gr.update(visible=False),
238
- inputs=[download_file],
239
- outputs=[download_file]
240
- )
241
-
242
-
243
- # Launch the app
244
  if __name__ == "__main__":
245
- # Use explicit server settings to ensure wide compatibility
246
- demo.launch(
247
- server_name="0.0.0.0",
248
- server_port=7860,
249
- share=False
250
- )
 
1
  import gradio as gr
2
  import pandas as pd
3
  import os
 
4
 
5
  try:
6
  import docx
7
  except ImportError:
8
+ docx = None
9
 
10
+ # ----------------------------
11
+ # CONFIG
12
+ # ----------------------------
13
 
14
  DEFAULT_CODES = [
15
  "Theme: Communication Barrier",
 
27
  }
28
 
29
 
30
+ # ----------------------------
31
+ # FILE READERS
32
+ # ----------------------------
33
 
def read_docx(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        file_path: Path of the document, passed straight to ``docx.Document``.

    Returns:
        The paragraph texts joined with newlines, or a fixed error string
        when the module-level ``docx`` name is falsy (the import fallback
        sets it to ``None`` when python-docx is missing).
    """
    if not docx:
        return "Error: python-docx is not installed."

    paragraphs = docx.Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
40
+
 
 
41
 
def read_vtt(file_path):
    """Extract the spoken text from a WebVTT subtitle file.

    Blank lines, lines starting with ``WEBVTT``, timing lines (anything
    containing ``-->``) and purely numeric cue identifiers are dropped; the
    remaining lines are stripped and joined with single spaces.

    Args:
        file_path: Path of the ``.vtt`` file to read.

    Returns:
        The transcript as one space-separated string ("" if nothing is left).
    """
    # "utf-8-sig" strips an optional leading byte-order mark. With plain
    # "utf-8" the first line would be "\ufeffWEBVTT", fail the startswith()
    # test below and leak into the transcript.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        cue_text = [
            line
            for line in (raw.strip() for raw in f)
            if line
            and not line.startswith("WEBVTT")
            and "-->" not in line
            and not line.isdigit()
        ]
    return " ".join(cue_text)
52
+
53
 
def get_initial_df():
    """Build the empty table that coded segments are appended to.

    Returns:
        A ``DataFrame`` with no rows whose columns are the four fixed coding
        columns followed by every key of ``METADATA_FIELDS``.
    """
    core_columns = ["File ID", "Code", "Coded Segment", "Context (100 chars)"]
    return pd.DataFrame(columns=[*core_columns, *METADATA_FIELDS])
57
+
58
+
59
+ # ----------------------------
60
+ # FILE PROCESSING
61
+ # ----------------------------
62
 
63
  def process_file(file_obj):
64
  if file_obj is None:
 
66
 
67
  file_path = file_obj.name
68
  filename = os.path.basename(file_path)
69
+
70
+ if filename.lower().endswith(".docx"):
71
+ text = read_docx(file_path)
72
+ elif filename.lower().endswith(".vtt"):
73
+ text = read_vtt(file_path)
74
  else:
75
+ with open(file_path, "r", encoding="utf-8") as f:
76
+ text = f.read()
77
+
78
+ return text, f"Loaded: {filename}", filename, get_initial_df()
79
+
80
+
81
+ # ----------------------------
82
+ # CODING FUNCTION
83
+ # ----------------------------
84
+
def apply_code(coded_df, file_id, full_text, segment_text, selected_code, *metadata_values):
    """Append one coded segment to the table of coded data.

    Args:
        coded_df: DataFrame holding the segments coded so far.
        file_id: Identifier stored in the "File ID" column.
        full_text: Full transcript, used only to look up preceding context.
        segment_text: The excerpt being coded.
        selected_code: The code/tag applied to the excerpt.
        *metadata_values: Values paired positionally with the keys of
            ``METADATA_FIELDS``.

    Returns:
        A ``(dataframe, status_message)`` tuple. When the segment is missing
        or blank, or no code is selected, ``coded_df`` is returned unchanged
        together with a prompt; otherwise a new DataFrame with the extra row.
    """
    # A whitespace-only segment is truthy but carries no text, so reject it
    # here instead of storing an empty coded row.
    if not (segment_text and segment_text.strip() and selected_code):
        return coded_df, "Please enter a segment and select a code."

    meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values))

    # Context is the (up to) 100 characters preceding the segment. Both texts
    # are whitespace-normalized first so line breaks in the transcript do not
    # defeat the match.
    context = "Context unavailable"
    norm_full = " ".join((full_text or "").split())
    norm_seg = " ".join(segment_text.split())
    idx = norm_full.find(norm_seg)
    if idx != -1:
        context = "..." + norm_full[max(0, idx - 100):idx]

    new_row = {
        "File ID": file_id,
        "Code": selected_code,
        "Coded Segment": segment_text,
        "Context (100 chars)": context,
        **meta_dict,
    }

    new_df = pd.concat([coded_df, pd.DataFrame([new_row])], ignore_index=True)
    return new_df, "Code applied!"
111
 
 
112
 
113
+ # ----------------------------
114
+ # EXPORT EXCEL
115
+ # ----------------------------
 
 
 
 
 
 
 
116
 
def generate_excel(coded_df):
    """Write the coded segments to an .xlsx file for download.

    Args:
        coded_df: DataFrame of coded segments.

    Returns:
        A ``(path, status_message)`` tuple. ``path`` is ``None`` when there
        is nothing to export or when writing the workbook fails; otherwise it
        is the path of the generated ``qualitative_codes.xlsx``.
    """
    if coded_df.empty:
        return None, "No coded segments yet."

    # Function-scope import: tempfile is not among the file's top-level imports.
    import tempfile

    # Write into a fresh directory per export so concurrent sessions do not
    # overwrite each other's workbook, while keeping the download file name.
    out_dir = tempfile.mkdtemp(prefix="qualitative_codes_")
    out_path = os.path.join(out_dir, "qualitative_codes.xlsx")
    try:
        coded_df.to_excel(out_path, index=False)
    except (ImportError, OSError, ValueError) as exc:
        # Report the failure through the status message (e.g. missing Excel
        # engine, unwritable disk) instead of raising into the UI callback.
        return None, f"Error generating Excel file: {exc}"
    return out_path, "Excel ready for download."
124
+
125
+
126
+ # ----------------------------
127
+ # GRADIO UI
128
+ # ----------------------------
129
 
with gr.Blocks(title="Qualitative Coding Interface") as demo:

    gr.Markdown("# 📑 Qualitative Coding Interface")
    gr.Markdown("Load transcripts → add metadata → code text segments → export Excel.")

    # Hidden states: current file name, full transcript text, coded rows.
    current_file_id = gr.State("")
    full_text_state = gr.State("")
    coded_state = gr.State(get_initial_df())

    # -----------------------
    # File upload
    # -----------------------
    with gr.Row():
        file_input = gr.File(label="Upload (.docx, .vtt, .txt)", file_types=[".docx", ".vtt", ".txt"])
        status = gr.Textbox(label="Status", value="Ready")

    # Loading a file replaces the transcript, the file id and the coded table.
    file_input.change(
        fn=process_file,
        inputs=file_input,
        outputs=[full_text_state, status, current_file_id, coded_state]
    )

    # -----------------------
    # Metadata
    # -----------------------
    gr.Markdown("## 📝 Interview Metadata")
    # One textbox per metadata field, in METADATA_FIELDS order; apply_code
    # relies on this order when pairing values with field keys.
    metadata_inputs = []
    with gr.Row():
        for lbl in METADATA_FIELDS.values():
            box = gr.Textbox(label=lbl)
            metadata_inputs.append(box)

    # -----------------------
    # Transcript
    # -----------------------
    gr.Markdown("## 📖 Transcript")
    transcript_box = gr.Textbox(label="Transcript (read-only)", lines=15, interactive=False)
    # Mirror the hidden transcript state into the visible read-only box.
    full_text_state.change(lambda x: x, inputs=full_text_state, outputs=transcript_box)

    # -----------------------
    # Coding controls
    # -----------------------
    gr.Markdown("## 🏷️ Apply Code")
    with gr.Row():
        segment_input = gr.Textbox(label="Paste segment", lines=3)
        code_choice = gr.Dropdown(label="Select Code", choices=DEFAULT_CODES)

    code_btn = gr.Button("Apply Code")

    # Metadata textboxes are appended after the five fixed inputs, so they
    # arrive in apply_code as its *metadata_values.
    code_btn.click(
        fn=apply_code,
        inputs=[coded_state, current_file_id, full_text_state, segment_input, code_choice] + metadata_inputs,
        outputs=[coded_state, status]
    )

    # -----------------------
    # Show Data
    # -----------------------
    gr.Markdown("## 📊 Coded Data")
    data_table = gr.Dataframe(interactive=False)
    # Mirror the hidden coded-data state into the visible table.
    coded_state.change(lambda x: x, inputs=coded_state, outputs=data_table)

    # -----------------------
    # Download Excel
    # -----------------------
    download_btn = gr.Button("Download XLSX")
    download_file = gr.File(label="Download", visible=False)

    def show_file(file):
        """Reveal the download component only when a file was produced."""
        return gr.update(visible=file is not None)

    # First generate the workbook, then toggle the file component's visibility
    # based on whether generate_excel returned a path.
    download_btn.click(
        generate_excel,
        inputs=coded_state,
        outputs=[download_file, status]
    ).then(
        show_file,
        inputs=download_file,
        outputs=download_file
    )


# Launch for HF Spaces
if __name__ == "__main__":
    demo.launch()