ahm14 commited on
Commit
d5c0fd1
·
verified ·
1 Parent(s): fe89b54

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +297 -0
app.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import pandas as pd
4
+ import streamlit as st
5
+ import re
6
+ import logging
7
+ import nltk
8
+ from docx import Document
9
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
10
+ from docx.shared import Pt
11
+ import io
12
+ from langdetect import detect
13
+ from collections import Counter
14
+ from dotenv import load_dotenv
15
+ from langchain_groq import ChatGroq
16
+ from langchain_core.output_parsers import StrOutputParser
17
+ from langchain_core.prompts import ChatPromptTemplate
18
+ from transformers import pipeline
19
+
20
# Configure logging FIRST: logging.error() below would otherwise auto-install
# a default handler via basicConfig(), making this format a silent no-op.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Load environment variables from a local .env file, if present.
load_dotenv()

# Check if the Groq API key is available; surface the problem in the UI too.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    logging.error("Missing Groq API key. Please set the GROQ_API_KEY environment variable.")
    st.error("API key is missing. Please provide a valid API key.")

# Initialize the LLM client (Groq API). Created even when the key is missing so
# that extract_tone() fails into its keyword fallback instead of crashing at import.
llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")

# Download required NLTK resources (quiet=True avoids console noise on Streamlit reruns).
nltk.download("punkt", quiet=True)
37
+
38
# Tone categories for fallback method.
# Maps a tone label -> keywords; extract_tone_fallback assigns the label when
# any keyword appears as a substring of the lower-cased text.
tone_categories = {
    "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
    "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
    "Somber": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief", "devastation"],
    "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
    "Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
    "Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
    "Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
    "Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
    "Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
    "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
}
51
+
52
# Frame categories for fallback method.
# Maps a frame label -> keywords; get_frame_category_mapping counts keyword
# substring hits per frame (lower-cased text) to rank each frame's focus level.
frame_categories = {
    "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
    "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
    "Gender & Patriarchy": ["gender", "women", "violence", "patriarchy", "equality"],
    "Religious Freedom & Persecution": ["religion", "persecution", "minorities", "intolerance", "faith"],
    "Grassroots Mobilization": ["activism", "community", "movement", "local", "mobilization"],
    "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
    "Anti-Extremism & Anti-Violence": ["extremism", "violence", "hate speech", "radicalism", "mob attack"],
    "Social Inequality & Economic Disparities": ["class privilege", "labor rights", "economic", "discrimination"],
    "Activism & Advocacy": ["justice", "rights", "demand", "protest", "march", "campaign", "freedom of speech"],
    "Systemic Oppression": ["discrimination", "oppression", "minorities", "marginalized", "exclusion"],
    "Intersectionality": ["intersecting", "women", "minorities", "struggles", "multiple oppression"],
    "Call to Action": ["join us", "sign petition", "take action", "mobilize", "support movement"],
    "Empowerment & Resistance": ["empower", "resist", "challenge", "fight for", "stand up"],
    "Climate Justice": ["environment", "climate change", "sustainability", "biodiversity", "pollution"],
    "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
}
70
+
71
# Detect language
def detect_language(text):
    """Return the language code langdetect reports for *text*, or "unknown" on failure."""
    try:
        language = detect(text)
    except Exception as e:
        logging.error(f"Error detecting language: {e}")
        return "unknown"
    return language
78
+
79
# Extract tone using Groq API (or fallback method)
def extract_tone(text):
    """Ask the Groq LLM for descriptive tone labels for *text*.

    Falls back to keyword matching (extract_tone_fallback) on any API error.

    Returns:
        list[str]: tone labels, e.g. ["Somber", "Informative"].
    """
    try:
        # ChatGroq is a LangChain chat model: it has no .chat() method and does
        # not return an OpenAI-style {"choices": ...} dict. Use .invoke(), which
        # returns an AIMessage whose text lives in .content.
        response = llm.invoke([
            ("system", "Analyze the tone of the following text and provide descriptive tone labels."),
            ("user", text),
        ])
        return response.content.split(", ")
    except Exception as e:
        logging.error(f"Groq API error: {e}")
        return extract_tone_fallback(text)
88
+
89
# Fallback method for tone extraction
def extract_tone_fallback(text):
    """Keyword-based tone detection used when the LLM call fails.

    A tone is detected when any of its keywords occurs as a substring of the
    lower-cased text; returns ["Neutral"] when nothing matches.
    """
    lowered = text.lower()
    matches = {
        tone
        for tone, keywords in tone_categories.items()
        if any(keyword in lowered for keyword in keywords)
    }
    return list(matches) if matches else ["Neutral"]
97
+
98
# Extract hashtags
def extract_hashtags(text):
    """Return every #hashtag token in *text*, in order of appearance."""
    hashtag_pattern = re.compile(r"#\w+")
    return hashtag_pattern.findall(text)
101
+
102
+ # -------------------------------------------------------------------
103
+ # New functions for frame categorization and display
104
+ # -------------------------------------------------------------------
105
+
106
def get_frame_category_mapping(text):
    """
    Returns a mapping of every frame (from frame_categories) to one of the four categories.
    Detected frames are assigned a focus level based on keyword frequency:
      - Top detected: "Major Focus"
      - Next up to two: "Significant Focus"
      - Remaining detected: "Minor Mention"
    Frames not detected get "Not Applicable".
    """
    lowered = text.lower()

    # Count keyword hits per frame (substring match on the lower-cased text).
    hits = {
        frame: sum(keyword in lowered for keyword in keywords)
        for frame, keywords in frame_categories.items()
    }

    # Frames with at least one hit, ranked by hit count (stable sort keeps
    # the frame_categories declaration order for ties).
    ranked = sorted(
        (frame for frame, count in hits.items() if count > 0),
        key=lambda frame: hits[frame],
        reverse=True,
    )

    mapping = {}
    for position, frame in enumerate(ranked):
        if position == 0:
            mapping[frame] = "Major Focus"
        elif position <= 2:
            mapping[frame] = "Significant Focus"
        else:
            mapping[frame] = "Minor Mention"

    # Everything not detected is explicitly marked Not Applicable.
    for frame in frame_categories:
        mapping.setdefault(frame, "Not Applicable")
    return mapping
141
+
142
def format_frame_categories_table(category_mapping):
    """
    Returns a markdown-formatted table displaying each frame with columns:
    Major Focus, Significant Focus, Minor Mention, and Not Applicable.
    A tick (✓) marks the assigned category.
    """
    columns = ("Major Focus", "Significant Focus", "Minor Mention", "Not Applicable")
    lines = [
        "| Frame | " + " | ".join(columns) + " |",
        "| " + " | ".join(["---"] * 5) + " |",
    ]
    for frame, assigned in category_mapping.items():
        marks = ["✓" if assigned == column else "" for column in columns]
        lines.append("| " + " | ".join([frame, *marks]) + " |")
    return "\n".join(lines) + "\n"
159
+
160
+ # -------------------------------------------------------------------
161
+ # Existing functions for file processing
162
+ # -------------------------------------------------------------------
163
+
164
def extract_captions_from_docx(docx_file):
    """Parse a DOCX whose paragraphs are grouped under "Post N" headings.

    Args:
        docx_file: path or file-like object accepted by docx.Document.

    Returns:
        dict: "Post N" heading -> caption text (paragraphs joined with spaces).
        Posts with no caption text are omitted.
    """
    doc = Document(docx_file)
    captions = {}
    current_post = None
    for para in doc.paragraphs:
        text = para.text.strip()
        if re.match(r"Post \d+", text, re.IGNORECASE):
            current_post = text
            captions[current_post] = []
        elif current_post and text:
            # Skip blank paragraphs: previously they were appended as empty
            # strings, producing doubled spaces in the joined caption and
            # letting caption-less posts slip past the `if lines` filter.
            captions[current_post].append(text)
    return {post: " ".join(lines) for post, lines in captions.items() if lines}
176
+
177
def extract_metadata_from_excel(excel_file):
    """Read an Excel sheet into a list of per-row dicts.

    Returns [] (and logs the error) when the file cannot be parsed.
    """
    try:
        records = pd.read_excel(excel_file).to_dict(orient="records")
    except Exception as e:
        logging.error(f"Error processing Excel file: {e}")
        return []
    return records
185
+
186
def merge_metadata_with_generated_data(generated_data, excel_metadata):
    """Fold spreadsheet rows into *generated_data*, keyed as "Post N".

    Rows carrying a "Post Number" merge into (or create) that post's entry;
    rows without one default to len(generated_data) + 1 at merge time.
    Mutates and returns *generated_data*.
    """
    for row in excel_metadata:
        fallback_number = len(generated_data) + 1
        key = f"Post {row.get('Post Number', fallback_number)}"
        if key in generated_data:
            generated_data[key].update(row)
        else:
            generated_data[key] = row
    return generated_data
194
+
195
def create_docx_from_data(extracted_data):
    """Build a python-docx Document summarizing each post's extracted fields.

    Args:
        extracted_data: mapping of post label -> dict of field name -> value.
            When a dict contains "FramesMapping" (frame -> focus category),
            frames are rendered as a real Word table; otherwise the "Frames"
            value is written as plain text.

    Returns:
        docx.Document ready to be saved.
    """
    doc = Document()
    for post_number, data in extracted_data.items():
        doc.add_heading(post_number, level=1)
        ordered_keys = [
            "Post Number", "Date of Post", "Media Type", "Number of Pictures",
            "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
            "Full Caption", "Language", "Tone", "Hashtags"
        ]
        for key in ordered_keys:
            value = data.get(key, "N/A")
            if key in ["Tone", "Hashtags"]:
                value = ", ".join(value) if isinstance(value, list) else value
            para = doc.add_paragraph()
            # Markdown "**" markers do not render in DOCX; use a real bold run.
            label = para.add_run(f"{key}: ")
            label.bold = True
            label.font.size = Pt(11)
            body = para.add_run(str(value))
            body.font.size = Pt(11)
        # Add a proper table for Frames if a mapping is available.
        if "FramesMapping" in data:
            doc.add_paragraph("Frames:")
            category_mapping = data["FramesMapping"]
            table = doc.add_table(rows=1, cols=5)
            table.style = "Light List Accent 1"
            hdr_cells = table.rows[0].cells
            hdr_cells[0].text = "Frame"
            hdr_cells[1].text = "Major Focus"
            hdr_cells[2].text = "Significant Focus"
            hdr_cells[3].text = "Minor Mention"
            hdr_cells[4].text = "Not Applicable"
            tick = "✓"
            for frame, category in category_mapping.items():
                row_cells = table.add_row().cells
                row_cells[0].text = frame
                row_cells[1].text = tick if category == "Major Focus" else ""
                row_cells[2].text = tick if category == "Significant Focus" else ""
                row_cells[3].text = tick if category == "Minor Mention" else ""
                row_cells[4].text = tick if category == "Not Applicable" else ""
        else:
            value = data.get("Frames", "N/A")
            para = doc.add_paragraph()
            label = para.add_run("Frames: ")
            label.bold = True
            para.add_run(str(value))
        doc.add_paragraph("\n")
    return doc
236
+
237
+
238
+
239
+ # -------------------------------------------------------------------
240
+ # Streamlit App UI
241
+ # -------------------------------------------------------------------
242
+
243
st.title("AI-Powered Coding Sheet Generator")
st.write("Enter text or upload a DOCX/Excel file for analysis:")

input_text = st.text_area("Input Text", height=200)
uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])

# Accumulates one dict of analysis fields per post / manual entry.
output_data = {}

if input_text:
    # Process manual input text
    frame_mapping = get_frame_category_mapping(input_text)
    frames_table = format_frame_categories_table(frame_mapping)
    output_data["Manual Input"] = {
        "Full Caption": input_text,
        "Language": detect_language(input_text),
        "Tone": extract_tone(input_text),
        "Hashtags": extract_hashtags(input_text),
        "Frames": frames_table,  # Markdown table for on-screen display
        # Raw mapping so create_docx_from_data can build a real Word table;
        # its "FramesMapping" branch was previously unreachable because
        # nothing ever stored this key.
        "FramesMapping": frame_mapping,
    }

if uploaded_docx:
    captions = extract_captions_from_docx(uploaded_docx)
    for caption, text in captions.items():
        frame_mapping = get_frame_category_mapping(text)
        frames_table = format_frame_categories_table(frame_mapping)
        output_data[caption] = {
            "Full Caption": text,
            "Language": detect_language(text),
            "Tone": extract_tone(text),
            "Hashtags": extract_hashtags(text),
            "Frames": frames_table,
            "FramesMapping": frame_mapping,
        }

if uploaded_excel:
    excel_metadata = extract_metadata_from_excel(uploaded_excel)
    output_data = merge_metadata_with_generated_data(output_data, excel_metadata)

# Display results in collapsible sections
if output_data:
    for post_number, data in output_data.items():
        with st.expander(post_number):
            for key, value in data.items():
                if key == "FramesMapping":
                    continue  # internal detail; already shown via the "Frames" table
                if key == "Frames":
                    st.markdown(f"**{key}:**\n{value}")
                else:
                    st.write(f"**{key}:** {value}")

# Generate DOCX output for download
if output_data:
    docx_output = create_docx_from_data(output_data)
    docx_io = io.BytesIO()
    docx_output.save(docx_io)
    docx_io.seek(0)
    st.download_button(
        "Download Merged Analysis as DOCX",
        data=docx_io,
        file_name="coding_sheet.docx",
        # Explicit MIME type so browsers treat the payload as a Word document.
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )