clementBE committed on
Commit
20a147b
·
verified ·
1 Parent(s): 8e98afb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +261 -0
app.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import os
4
+ import io
5
+ # You will need to install python-docx for .docx file support
6
+ try:
7
+ import docx
8
+ except ImportError:
9
+ print("Warning: 'python-docx' library not found. Install with: pip install python-docx")
10
+ print("DOCX files will not be supported.")
11
+ docx = None
12
+
13
+ # --- 1. CONFIGURATION ---
14
+
15
# Codes offered to the analyst when tagging a transcript segment.
DEFAULT_CODES = [
    "Theme: Communication Barrier",
    "Theme: Emotional Support",
    "Theme: Future Aspirations",
    "Theme: Financial Stress",
    "Other: Follow-up Needed",
]

# Interview metadata to collect: internal field key -> label shown in the UI.
# Insertion order is significant; it fixes the order of the metadata inputs.
METADATA_FIELDS = dict(
    interview_id="Interview ID (e.g., I-001)",
    interview_date="Date of Interview (YYYY-MM-DD)",
    occupation="Participant Occupation",
    age="Participant Age",
)
31
+
32
+
33
+ # --- 2. FILE PROCESSING FUNCTIONS ---
34
+
35
def read_docx(file_path):
    """Return the text of a .docx file, one paragraph per line.

    If the module-level ``docx`` name is falsy (the optional python-docx
    import failed), an error message string is returned in place of the
    document text.
    """
    if not docx:
        return "Error: python-docx library is not installed. Cannot read .docx."

    document = docx.Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
45
+
46
def read_vtt(file_path):
    """Extract the spoken text from a WebVTT (.vtt) file as one string.

    Everything that is not cue text is dropped:

    * the ``WEBVTT`` header block, including any metadata lines under it,
    * ``NOTE`` comment blocks and ``STYLE`` / ``REGION`` definition blocks,
    * cue timing lines (anything containing ``-->``),
    * purely numeric cue identifiers and blank lines.

    Args:
        file_path: Path of the .vtt file. It is read as UTF-8; a leading
            byte-order mark is tolerated.

    Returns:
        The remaining cue text lines, stripped and joined with single spaces.
    """
    # 'utf-8-sig' strips an optional BOM; with plain 'utf-8' the first line
    # would be '\ufeffWEBVTT' and slip past the header check below.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        lines = [line.strip() for line in f]

    transcript_lines = []
    at_block_start = True    # blocks are separated by blank lines
    skipping_block = False   # inside a header / NOTE / STYLE / REGION block
    for line in lines:
        if not line:
            at_block_start = True
            skipping_block = False
            continue

        if '-->' in line:
            # A timing line always introduces cue text, so it also ends any
            # skipped block (covers files missing the blank separator line).
            at_block_start = False
            skipping_block = False
            continue

        if at_block_start:
            at_block_start = False
            skipping_block = (
                line.startswith("WEBVTT")
                or line in ("NOTE", "STYLE", "REGION")
                or line.startswith(("NOTE ", "NOTE\t"))
            )

        if skipping_block or line.startswith("WEBVTT") or line.isdigit():
            continue

        transcript_lines.append(line)

    return ' '.join(transcript_lines)
59
+
60
def process_file(file_obj):
    """Load an uploaded transcript and return its plain-text content.

    Args:
        file_obj: The uploaded file. Either an object exposing the path as
            ``.name``, a plain path string / ``os.PathLike``, or ``None``
            when nothing is uploaded.

    Returns:
        A 4-tuple ``(text, status_message, filename, coded_df)``. Every code
        path returns all four items so the result always matches the four
        output components this function is wired to. ``coded_df`` is always a
        fresh, empty coding table: loading (or failing to load) a file resets
        the coded data. On failure ``text`` and ``filename`` are empty strings
        and ``status_message`` describes the problem.
    """
    # Fresh table handed back on every path (resets the coded-data state).
    empty_coded_df = pd.DataFrame(
        columns=["File ID", "Code", "Coded Segment", "Context (100 chars)"]
    )

    if file_obj is None:
        return "", "No file uploaded.", "", empty_coded_df

    # Accept both a bare path and a file-like wrapper carrying ``.name``.
    if isinstance(file_obj, (str, os.PathLike)):
        file_path = os.fspath(file_obj)
    else:
        file_path = file_obj.name
    filename = os.path.basename(file_path)
    lowered = filename.lower()

    # The readers can fail too (corrupt .docx, wrong encoding, missing file),
    # so all three branches share one error path.
    try:
        if lowered.endswith('.docx'):
            text_content = read_docx(file_path)
        elif lowered.endswith('.vtt'):
            text_content = read_vtt(file_path)
        else:
            # Plain text files, and the fallback for any other extension.
            with open(file_path, 'r', encoding='utf-8') as f:
                text_content = f.read()
    except Exception as e:
        return "", f"Error reading file: {e}", "", empty_coded_df

    return text_content, f"✅ Loaded: {filename}", filename, empty_coded_df
84
+
85
+ # --- 3. CODING/DATA MANAGEMENT FUNCTIONS ---
86
+
87
def apply_code(
    coded_data_df,
    file_id,
    full_text,
    segment_text,
    selected_code,
    metadata_values
):
    """Record one coded segment, with its metadata, as a new table row.

    Args:
        coded_data_df: DataFrame of segments coded so far (may be empty or
            ``None``). It is not modified.
        file_id: Identifier of the transcript the segment comes from.
        full_text: Full transcript text, used to look up the segment's context.
        segment_text: The text being coded. Surrounding whitespace is ignored.
        selected_code: The code/tag to attach to the segment.
        metadata_values: Metadata values, in the same order as the keys of
            ``METADATA_FIELDS``. ``None`` is treated as no metadata.

    Returns:
        ``(dataframe, status_message)``. When the segment (after stripping) or
        the code is empty, the input DataFrame is returned unchanged together
        with a warning. Otherwise a new DataFrame with the row appended is
        returned together with a success message.
    """
    # Stray whitespace/newlines from copy-paste must neither count as a
    # segment nor prevent the segment from being found in the transcript.
    segment = segment_text.strip() if segment_text else ""
    if not segment or not selected_code:
        return coded_data_df, "⚠️ Please select a text segment and a code."

    # Pair each metadata key with the value supplied at the same position.
    meta_dict = dict(zip(METADATA_FIELDS.keys(), metadata_values or []))

    # Context: up to 100 characters immediately preceding the segment.
    transcript = full_text or ""
    start_index = transcript.find(segment)
    if start_index == -1:
        context = "Segment not found in transcript (may be due to formatting)."
    else:
        preceding = transcript[max(0, start_index - 100):start_index]
        context = '...' + preceding.replace('\n', ' ')

    new_row = {
        "File ID": file_id,
        "Code": selected_code,
        "Coded Segment": segment,
        "Context (100 chars)": context,
        **meta_dict,  # one column per metadata field
    }
    new_row_df = pd.DataFrame([new_row])

    if coded_data_df is None or coded_data_df.empty:
        # Nothing to concatenate onto: keep the existing column order (if
        # any) and add whatever columns the new row introduces.
        existing = [] if coded_data_df is None else list(coded_data_df.columns)
        columns = existing + [col for col in new_row if col not in existing]
        new_df = new_row_df.reindex(columns=columns)
    else:
        new_df = pd.concat([coded_data_df, new_row_df], ignore_index=True)

    return new_df, "✅ Code applied successfully!"
126
+
127
def generate_excel(coded_data_df):
    """Write the coded segments to an .xlsx file and return its path.

    Args:
        coded_data_df: DataFrame of coded segments, or ``None``.

    Returns:
        ``(path, status_message)``. ``path`` is ``None`` when there is
        nothing to export (``None`` or empty DataFrame). Otherwise it is the
        path of a file named ``qualitative_codes.xlsx`` inside a newly created
        temporary directory.
    """
    import tempfile  # local import: only this function needs it

    if coded_data_df is None or coded_data_df.empty:
        return None, "⚠️ No codes have been applied yet."

    # A fresh directory per export keeps the download name stable while
    # preventing concurrent sessions from overwriting each other's file
    # (a fixed path in the working directory would be shared by everyone).
    export_dir = tempfile.mkdtemp(prefix="qualitative_codes_")
    output_path = os.path.join(export_dir, "qualitative_codes.xlsx")

    # XLSX export goes through the 'openpyxl' engine, which must be installed.
    coded_data_df.to_excel(output_path, index=False, engine='openpyxl')

    return output_path, "✅ Excel file generated and ready for download."
137
+
138
+
139
+ # --- 4. GRADIO INTERFACE ---
140
+
141
# Column layout of the coded-data table: fixed columns, then one per metadata field.
_CODED_COLUMNS = ["File ID", "Code", "Coded Segment", "Context (100 chars)"] + list(METADATA_FIELDS.keys())


def _load_transcript(file_obj):
    """Run process_file and fan the result out to both state and visible components."""
    result = process_file(file_obj)
    text, status, file_id = result[:3]
    # Tolerate a 3-item result (no coded-data table) by substituting an empty table.
    coded_df = result[3] if len(result) > 3 else pd.DataFrame(columns=_CODED_COLUMNS)
    # Order matches the `outputs` list of the file_input.change binding below.
    return text, text, status, file_id, coded_df, coded_df


def _apply_code_from_ui(coded_df, file_id, full_text, segment_text, selected_code, *metadata_values):
    """Collect the metadata textbox values into a list and delegate to apply_code."""
    new_df, status = apply_code(
        coded_df, file_id, full_text, segment_text, selected_code, list(metadata_values)
    )
    # The same table updates the hidden state and the visible Dataframe.
    return new_df, new_df, status


with gr.Blocks(title="Qualitative Coding Interface") as demo:
    gr.Markdown("# 📑 Qualitative Coding Interface")
    gr.Markdown(
        "Upload a `.docx`, `.vtt`, or `.txt` transcript, add interview metadata, and then "
        "copy text segments from the transcript box to the 'Segment to Code' box below to apply tags."
    )

    # --- State Management (Hidden) ---
    # Stores the currently loaded filename
    current_file_id = gr.State(value="")
    # Stores the full text content of the transcript
    full_transcript_text = gr.State(value="")
    # Stores the running table of coded segments
    coded_data_state = gr.State(value=pd.DataFrame(columns=_CODED_COLUMNS))

    # --- A. FILE UPLOAD & METADATA ---
    with gr.Row():
        file_input = gr.File(
            label="Upload Transcript (.docx, .vtt, .txt)",
            file_types=[".docx", ".vtt", ".txt"],
            scale=1
        )
        status_message = gr.Textbox(label="Status", value="Ready", scale=2)

    gr.Markdown("---")
    gr.Markdown("## 📝 Interview Metadata")

    # One textbox per metadata field, in METADATA_FIELDS order
    metadata_inputs = []
    with gr.Row():
        for label in METADATA_FIELDS.values():
            metadata_inputs.append(gr.Textbox(label=label, value="", max_lines=1, interactive=True))

    gr.Markdown("---")

    # --- B. TRANSCRIPT VIEW ---
    gr.Markdown("## 📖 Transcript")
    # Display the full text (non-interactive so users copy from it)
    transcript_display = gr.Textbox(
        label="Transcript Content (Read-only - Copy segments from here)",
        lines=15,
        interactive=False,
        value="",
    )

    gr.Markdown("---")

    # --- C. CODING/TAGGING CONTROLS ---
    gr.Markdown("## 🏷️ Apply Code")
    with gr.Row():
        segment_input = gr.Textbox(
            label="Segment to Code (Paste the text you copied from above)",
            lines=3,
            scale=3
        )
        code_dropdown = gr.Dropdown(
            label="Select Code/Tag",
            choices=DEFAULT_CODES,
            scale=1
        )

    code_btn = gr.Button("Apply Code & Save Segment", variant="primary")

    # --- D. CODED DATA & DOWNLOAD ---
    gr.Markdown("---")
    gr.Markdown("## 📊 Coded Data")

    # NOTE(review): the original passed height=300; the name of that keyword
    # appears to differ between Gradio major versions (height vs max_height),
    # so it is left at the default here. Confirm against the pinned version.
    coded_output_df = gr.Dataframe(
        label="Current Coded Segments",
        interactive=False,
    )

    with gr.Row():
        download_btn = gr.Button("Download Codes as XLSX", variant="secondary")
        download_file = gr.File(label="Download File")

    # --- E. ACTION BINDINGS ---
    # Handlers write to the hidden state AND its visible counterpart directly,
    # so nothing depends on change events being available on gr.State.

    # 0. File upload / clear: load the transcript and reset the coded data
    file_input.change(
        fn=_load_transcript,
        inputs=file_input,
        outputs=[
            full_transcript_text,
            transcript_display,
            status_message,
            current_file_id,
            coded_data_state,
            coded_output_df,
        ],
    )

    # 1. Apply Code Button Logic
    code_btn.click(
        fn=_apply_code_from_ui,
        inputs=[
            coded_data_state,
            current_file_id,
            full_transcript_text,
            segment_input,
            code_dropdown,
            *metadata_inputs,  # each textbox is its own input; gathered by *metadata_values
        ],
        outputs=[coded_data_state, coded_output_df, status_message]
    )

    # 2. Download Button Logic
    download_btn.click(
        fn=generate_excel,
        inputs=coded_data_state,
        outputs=[download_file, status_message]
    )
256
+
257
# Start the Gradio server only when this file is executed as a script.
# Required packages: pip install gradio pandas openpyxl python-docx
if __name__ == "__main__":
    demo.launch()