ahm14 committed on
Commit
ba880a7
·
verified ·
1 Parent(s): eccdab3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +332 -62
app.py CHANGED
@@ -5,85 +5,355 @@ import re
5
  import logging
6
  import nltk
7
  from docx import Document
8
- from collections import Counter
 
9
  import io
 
 
10
  from dotenv import load_dotenv
 
 
 
 
11
 
12
  # Load environment variables
13
  load_dotenv()
14
 
 
 
 
 
 
 
15
  # Initialize logging
16
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
17
 
 
 
 
18
  # Download required NLTK resources
19
  nltk.download("punkt")
20
 
 
21
  st.title("AI-Powered Coding Sheet Generator")
22
- st.write("Enter text or upload a DOCX/Excel file for analysis:")
23
-
24
- # Option to enable separate tab feature
25
- separate_tab = st.checkbox("Enable Separate Tab for Summary")
26
 
27
- input_text = st.text_area("Input Text", height=200)
28
- uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
29
- uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
 
 
30
 
31
- output_data = {}
32
-
33
- # Function to extract text from DOCX
34
- def extract_text_from_docx(docx_file):
35
- doc = Document(docx_file)
36
- return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
37
-
38
- # Function to analyze summary data
39
- def generate_summary(data):
40
- total_posts = len(data)
41
- tones = Counter()
42
- languages = Counter()
43
- frames = Counter()
44
- frame_focus = {"Major Focus": Counter(), "Significant Focus": Counter(), "Minor Mention": Counter(), "Not Applicable": Counter()}
45
-
46
- for post in data.values():
47
- tones.update(post.get("Tone", []))
48
- languages[post.get("Language", "Unknown")] += 1
49
- frame_mapping = post.get("FramesMapping", {})
50
- for frame, focus in frame_mapping.items():
51
- frames[frame] += 1
52
- frame_focus[focus][frame] += 1
53
-
54
- abstract = f"This document contains {total_posts} posts. The most commonly used tone is '{tones.most_common(1)}'. "
55
- abstract += f"The most frequently mentioned frame is '{frames.most_common(1)}'. Languages used include {list(languages.keys())}."
56
-
57
- return total_posts, tones, languages, frames, frame_focus, abstract
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- # Function to create an Excel summary
60
- def create_summary_excel(summary_data):
61
- total_posts, tones, languages, frames, frame_focus, abstract = summary_data
62
- with io.BytesIO() as buffer:
63
- writer = pd.ExcelWriter(buffer, engine='xlsxwriter')
 
 
64
 
65
- pd.DataFrame(tones.items(), columns=["Tone", "Count"]).to_excel(writer, sheet_name="Tones", index=False)
66
- pd.DataFrame(languages.items(), columns=["Language", "Count"]).to_excel(writer, sheet_name="Languages", index=False)
67
- pd.DataFrame(frames.items(), columns=["Frame", "Count"]).to_excel(writer, sheet_name="Frames", index=False)
68
 
69
- for focus, counts in frame_focus.items():
70
- pd.DataFrame(counts.items(), columns=["Frame", "Count"]).to_excel(writer, sheet_name=focus, index=False)
 
 
 
71
 
72
- pd.DataFrame({"Abstract": [abstract]}).to_excel(writer, sheet_name="Abstract", index=False)
 
 
 
 
 
 
 
 
 
73
 
74
- writer.close()
75
- buffer.seek(0)
76
- return buffer.getvalue()
77
-
78
- if uploaded_docx:
79
- docx_text = extract_text_from_docx(uploaded_docx)
80
- summary_data = generate_summary({"Uploaded DOCX": {"Full Caption": docx_text}})
81
- if separate_tab:
82
- with st.expander("Summary Tab"):
83
- st.write(f"Total Posts: {summary_data[0]}")
84
- st.write(f"Tones: {dict(summary_data[1])}")
85
- st.write(f"Languages: {dict(summary_data[2])}")
86
- st.write(f"Frames: {dict(summary_data[3])}")
87
- st.write(f"Abstract: {summary_data[5]}")
88
- excel_data = create_summary_excel(summary_data)
89
- st.download_button("Download Summary as Excel", data=excel_data, file_name="summary.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
 
 
 
 
 
 
 
 
 
 
 
5
  import logging
6
  import nltk
7
  from docx import Document
8
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
9
+ from docx.shared import Pt
10
  import io
11
+ from langdetect import detect
12
+ from collections import Counter
13
  from dotenv import load_dotenv
14
+ from langchain_groq import ChatGroq
15
+ from langchain_core.output_parsers import StrOutputParser
16
+ from langchain_core.prompts import ChatPromptTemplate
17
+ from transformers import pipeline
18
 
19
  # Load environment variables
20
  load_dotenv()
21
 
22
# Initialize logging first: the root logger auto-configures itself on the
# first logging.error() call, which would turn a later basicConfig() into a
# silent no-op (wrong level/format for the rest of the app).
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Check if Groq API key is available
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    logging.error("Missing Groq API key. Please set the GROQ_API_KEY environment variable.")
    st.error("API key is missing. Please provide a valid API key.")

# Initialize LLM (Groq API)
llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")

# Download required NLTK resources
nltk.download("punkt")

# Streamlit App UI
st.title("AI-Powered Coding Sheet Generator")
tabs = st.tabs(["Text Analysis", "DOCX Processing"])
 
 
 
40
 
41
with tabs[0]:
    st.write("Enter text or upload a DOCX/Excel file for analysis:")
    # Free-form text input for ad-hoc analysis.
    input_text = st.text_area("Input Text", height=200)
    # key="docx1" keeps this uploader's widget id distinct from the
    # second DOCX uploader in the "DOCX Processing" tab (key="docx2").
    uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"], key="docx1")
    # Optional Excel metadata to merge into the generated analysis.
    uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
46
 
47
# Existing processing logic...
# Tone categories for fallback method.
# Keyword lexicon: extract_tone_fallback assigns a tone label when any of its
# keywords occurs as a substring of the lowercased input text.
tone_categories = {
    "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
    "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
    "Somber": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief", "devastation"],
    "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
    "Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
    "Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
    "Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
    "Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
    "Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
    "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
}

# Frame categories for fallback method.
# Same keyword-matching scheme; dict insertion order matters downstream:
# get_frame_category_mapping breaks frequency ties by this order (stable sort).
frame_categories = {
    "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
    "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
    "Gender & Patriarchy": ["gender", "women", "violence", "patriarchy", "equality"],
    "Religious Freedom & Persecution": ["religion", "persecution", "minorities", "intolerance", "faith"],
    "Grassroots Mobilization": ["activism", "community", "movement", "local", "mobilization"],
    "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
    "Anti-Extremism & Anti-Violence": ["extremism", "violence", "hate speech", "radicalism", "mob attack"],
    "Social Inequality & Economic Disparities": ["class privilege", "labor rights", "economic", "discrimination"],
    "Activism & Advocacy": ["justice", "rights", "demand", "protest", "march", "campaign", "freedom of speech"],
    "Systemic Oppression": ["discrimination", "oppression", "minorities", "marginalized", "exclusion"],
    "Intersectionality": ["intersecting", "women", "minorities", "struggles", "multiple oppression"],
    "Call to Action": ["join us", "sign petition", "take action", "mobilize", "support movement"],
    "Empowerment & Resistance": ["empower", "resist", "challenge", "fight for", "stand up"],
    "Climate Justice": ["environment", "climate change", "sustainability", "biodiversity", "pollution"],
    "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
}
80
+
81
# Detect language
def detect_language(text):
    """Return the ISO language code langdetect reports for *text*, or "unknown" on failure."""
    try:
        code = detect(text)
    except Exception as exc:
        # langdetect raises on empty/ambiguous input; degrade gracefully.
        logging.error(f"Error detecting language: {exc}")
        return "unknown"
    return code
88
+
89
# Extract tone using Groq API (or fallback method)
def extract_tone(text):
    """
    Return a list of tone labels for *text*.

    Asks the Groq chat model first; on any API failure falls back to the
    keyword-based extract_tone_fallback.
    """
    try:
        # langchain's ChatGroq exposes .invoke(), not .chat(), and returns an
        # AIMessage whose text lives in .content — the previous
        # llm.chat(...)["choices"][0]... call raised on every invocation and
        # silently forced the fallback path.
        response = llm.invoke([
            ("system", "Analyze the tone of the following text and provide descriptive tone labels."),
            ("user", text),
        ])
        return response.content.split(", ")
    except Exception as e:
        logging.error(f"Groq API error: {e}")
        return extract_tone_fallback(text)
98
+
99
# Fallback method for tone extraction
def extract_tone_fallback(text):
    """Keyword-based tone detection; returns ["Neutral"] when no lexicon matches."""
    lowered = text.lower()
    matched = {
        category
        for category, keywords in tone_categories.items()
        if any(keyword in lowered for keyword in keywords)
    }
    return list(matched) if matched else ["Neutral"]
107
+
108
# Extract hashtags
def extract_hashtags(text):
    """Return every #hashtag token (#' followed by word characters) found in *text*."""
    hashtag_pattern = re.compile(r"#\w+")
    return hashtag_pattern.findall(text)
111
+
112
# -------------------------------------------------------------------
# New functions for frame categorization and display
# -------------------------------------------------------------------

def get_frame_category_mapping(text):
    """
    Map every frame in frame_categories to a focus level for *text*.

    Detected frames (at least one keyword hit) are ranked by hit count:
    the top frame gets "Major Focus", the next up to two get
    "Significant Focus", the remainder get "Minor Mention". Frames with
    no hits are marked "Not Applicable".
    """
    lowered = text.lower()

    # Keyword-hit count per frame (substring matching, like the original).
    hits = {
        frame: sum(keyword in lowered for keyword in keywords)
        for frame, keywords in frame_categories.items()
    }

    # Detected frames, most frequent first; sorted() is stable, so ties
    # keep frame_categories declaration order.
    ranked = sorted(
        (frame for frame, count in hits.items() if count > 0),
        key=lambda frame: hits[frame],
        reverse=True,
    )

    mapping = {}
    for position, frame in enumerate(ranked):
        if position == 0:
            mapping[frame] = "Major Focus"
        elif position <= 2:
            mapping[frame] = "Significant Focus"
        else:
            mapping[frame] = "Minor Mention"

    # Everything that never matched is explicitly Not Applicable.
    for frame in frame_categories:
        mapping.setdefault(frame, "Not Applicable")
    return mapping
151
+
152
def format_frame_categories_table(category_mapping):
    """
    Render *category_mapping* as a markdown table: one row per frame, with a
    tick (✓) in the single column matching its assigned focus category.
    """
    tick = "✓"
    columns = ["Major Focus", "Significant Focus", "Minor Mention", "Not Applicable"]
    lines = [
        "| Frame | Major Focus | Significant Focus | Minor Mention | Not Applicable |",
        "| --- | --- | --- | --- | --- |",
    ]
    for frame, assigned in category_mapping.items():
        cells = [tick if assigned == column else "" for column in columns]
        lines.append("| " + frame + " | " + " | ".join(cells) + " |")
    # Trailing newline matches the original line-by-line concatenation.
    return "\n".join(lines) + "\n"
169
+
170
# -------------------------------------------------------------------
# Existing functions for file processing
# -------------------------------------------------------------------

def extract_captions_from_docx(docx_file):
    """
    Group paragraph text under "Post N" heading paragraphs.

    Returns {post heading: space-joined following paragraphs}; posts with no
    body paragraphs at all are dropped.
    """
    document = Document(docx_file)
    grouped = {}
    active_post = None
    for paragraph in document.paragraphs:
        line = paragraph.text.strip()
        if re.match(r"Post \d+", line, re.IGNORECASE):
            # New post heading: start collecting its body lines.
            active_post = line
            grouped[active_post] = []
        elif active_post:
            grouped[active_post].append(line)
    return {post: " ".join(parts) for post, parts in grouped.items() if parts}
186
+
187
def extract_metadata_from_excel(excel_file):
    """Read *excel_file* into a list of per-row dicts; returns [] on any read error."""
    try:
        frame = pd.read_excel(excel_file)
        records = frame.to_dict(orient="records")
    except Exception as exc:
        logging.error(f"Error processing Excel file: {exc}")
        return []
    return records
195
+
196
def merge_metadata_with_generated_data(generated_data, excel_metadata):
    """
    Merge Excel row dicts into *generated_data*, keyed by "Post N".

    Rows with a "Post Number" field update (or create) that post; rows
    without one default to one past the current post count. Mutates and
    returns *generated_data*.
    </style note: same contract as before, fresh phrasing>
    """
    for row in excel_metadata:
        key = "Post {}".format(row.get("Post Number", len(generated_data) + 1))
        if key in generated_data:
            generated_data[key].update(row)
        else:
            generated_data[key] = row
    return generated_data
204
+
205
def create_docx_from_data(extracted_data):
    """
    Build a python-docx Document from *extracted_data* ({post: field dict}).

    Each post becomes a level-1 heading followed by "Key: value" paragraphs
    (bold labels); when a "FramesMapping" is present the frame focus levels
    are rendered as a 5-column tick table, otherwise the raw "Frames" value
    is written as a paragraph.
    """
    doc = Document()
    for post_number, data in extracted_data.items():
        doc.add_heading(post_number, level=1)
        ordered_keys = [
            "Post Number", "Date of Post", "Media Type", "Number of Pictures",
            "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
            "Full Caption", "Language", "Tone", "Hashtags"
        ]
        for key in ordered_keys:
            value = data.get(key, "N/A")
            if key in ["Tone", "Hashtags"]:
                value = ", ".join(value) if isinstance(value, list) else value
            para = doc.add_paragraph()
            # DOCX does not render markdown: writing literal "**key:**" showed
            # the asterisks in the document. Use a bold run for the label.
            label = para.add_run(f"{key}: ")
            label.bold = True
            label.font.size = Pt(11)
            body = para.add_run(str(value))
            body.font.size = Pt(11)
        # Add a proper table for Frames if a mapping is available.
        if "FramesMapping" in data:
            doc.add_paragraph("Frames:")
            mapping = data["FramesMapping"]
            table = doc.add_table(rows=1, cols=5)
            table.style = "Light List Accent 1"
            hdr_cells = table.rows[0].cells
            hdr_cells[0].text = "Frame"
            hdr_cells[1].text = "Major Focus"
            hdr_cells[2].text = "Significant Focus"
            hdr_cells[3].text = "Minor Mention"
            hdr_cells[4].text = "Not Applicable"
            tick = "✓"
            for frame, category in mapping.items():
                row_cells = table.add_row().cells
                row_cells[0].text = frame
                row_cells[1].text = tick if category == "Major Focus" else ""
                row_cells[2].text = tick if category == "Significant Focus" else ""
                row_cells[3].text = tick if category == "Minor Mention" else ""
                row_cells[4].text = tick if category == "Not Applicable" else ""
        else:
            # No mapping: fall back to the raw "Frames" value, bold label again
            # instead of literal markdown asterisks.
            value = data.get("Frames", "N/A")
            para = doc.add_paragraph()
            label = para.add_run("Frames: ")
            label.bold = True
            para.add_run(str(value))
        doc.add_paragraph("\n")
    return doc
246
+
247
# -------------------------------------------------------------------
# Streamlit App UI
# -------------------------------------------------------------------
# NOTE: st.title() is already rendered once at the top of the app; repeating
# it here produced a duplicated page title, so it was removed.

st.write("Enter text or upload a DOCX/Excel file for analysis:")

# Unique keys are required: widgets with identical labels/params are also
# created in the "Text Analysis" tab, and Streamlit raises a duplicate
# element id error when two widgets hash to the same auto-generated id.
input_text = st.text_area("Input Text", height=200, key="main_input_text")
uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"], key="main_docx")
uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"], key="main_excel")

output_data = {}

if input_text:
    frame_mapping = get_frame_category_mapping(input_text)
    frames_table = format_frame_categories_table(frame_mapping)
    output_data["Manual Input"] = {
        "Full Caption": input_text,
        "Language": detect_language(input_text),
        "Tone": extract_tone(input_text),
        "Hashtags": extract_hashtags(input_text),
        "Frames": frames_table,
        "FramesMapping": frame_mapping,
    }

if uploaded_docx:
    captions = extract_captions_from_docx(uploaded_docx)
    for caption, text in captions.items():
        frame_mapping = get_frame_category_mapping(text)
        frames_table = format_frame_categories_table(frame_mapping)
        output_data[caption] = {
            "Full Caption": text,
            "Language": detect_language(text),
            "Tone": extract_tone(text),
            "Hashtags": extract_hashtags(text),
            "Frames": frames_table,
            "FramesMapping": frame_mapping,
        }

if uploaded_excel:
    excel_metadata = extract_metadata_from_excel(uploaded_excel)
    output_data = merge_metadata_with_generated_data(output_data, excel_metadata)

if output_data:
    # Per-post expandable preview of every generated/merged field.
    for post_number, data in output_data.items():
        with st.expander(post_number):
            for key, value in data.items():
                if key == "Frames":
                    st.markdown(f"**{key}:**\n{value}")
                else:
                    st.write(f"**{key}:** {value}")

    # DOCX export of the merged analysis (merged with the preview's
    # `if output_data:` — the two identical conditionals were redundant).
    docx_output = create_docx_from_data(output_data)
    docx_io = io.BytesIO()
    docx_output.save(docx_io)
    docx_io.seek(0)
    st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="coding_sheet.docx")
305
 
306
with tabs[1]:
    st.write("Upload a DOCX file for document-wide processing:")
    uploaded_docx2 = st.file_uploader("Upload a DOCX file", type=["docx"], key="docx2")

    if uploaded_docx2:
        doc = Document(uploaded_docx2)
        texts = [para.text.strip() for para in doc.paragraphs if para.text.strip()]

        # Count total posts ("Post N" heading paragraphs).
        total_posts = sum(1 for t in texts if re.match(r"Post \d+", t))

        # Aggregate tone, language, and frame statistics across paragraphs.
        tones = []
        languages = []
        frames_count = Counter()       # paragraphs in which each frame was actually detected
        frame_focus_count = Counter()  # document-wide distribution of focus levels
        per_frame_focus = {}           # frame -> Counter of its own focus levels

        for text in texts:
            tones.extend(extract_tone(text))
            languages.append(detect_language(text))

            frame_mapping = get_frame_category_mapping(text)
            for frame, category in frame_mapping.items():
                # The mapping always contains EVERY frame (undetected ones as
                # "Not Applicable"), so unconditionally incrementing made all
                # frame counts equal to len(texts). Count detections only.
                if category != "Not Applicable":
                    frames_count[frame] += 1
                frame_focus_count[category] += 1
                per_frame_focus.setdefault(frame, Counter())[category] += 1

        # Generate Summary
        summary = f"Total Posts: {total_posts}\n"
        summary += f"Detected Tones: {Counter(tones)}\n"
        summary += f"Languages Used: {Counter(languages)}\n"
        summary += f"Frame Distribution: {frames_count}\n"
        summary += f"Frame Focus Levels: {frame_focus_count}\n"

        st.write("## Document Summary")
        st.text(summary)

        # Excel export: one row per frame with that frame's own focus-level
        # breakdown (previously every row repeated the document-wide totals,
        # which made the per-frame columns meaningless).
        frames = list(per_frame_focus.keys())
        df = pd.DataFrame({
            "Frame": frames,
            "Count": [frames_count.get(f, 0) for f in frames],
            "Major Focus": [per_frame_focus[f].get("Major Focus", 0) for f in frames],
            "Significant Focus": [per_frame_focus[f].get("Significant Focus", 0) for f in frames],
            "Minor Mention": [per_frame_focus[f].get("Minor Mention", 0) for f in frames],
            "Not Applicable": [per_frame_focus[f].get("Not Applicable", 0) for f in frames],
        })

        excel_io = io.BytesIO()
        with pd.ExcelWriter(excel_io, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False, sheet_name='Frame Analysis')
        excel_io.seek(0)

        st.download_button("Download Analysis as Excel", data=excel_io, file_name="document_analysis.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")