File size: 20,141 Bytes
80f6eb6
 
 
 
 
 
 
ba880a7
 
80f6eb6
ba880a7
 
80f6eb6
ba880a7
 
 
 
e71a397
80f6eb6
 
 
 
ba880a7
 
 
 
 
 
80f6eb6
 
 
b82621e
 
ba880a7
 
 
80f6eb6
 
 
b797eeb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6652627
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2102588
b82621e
 
 
 
8106243
b82621e
 
 
 
 
 
 
 
 
 
52f21c1
b82621e
 
 
 
 
 
 
8106243
 
 
 
b82621e
b797eeb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80f6eb6
b797eeb
80f6eb6
b797eeb
 
 
 
 
 
ba880a7
 
b797eeb
 
 
ba880a7
b797eeb
ba880a7
 
 
 
 
 
 
 
 
 
 
b797eeb
ba880a7
 
 
 
 
 
 
 
 
 
 
 
 
b797eeb
ba880a7
 
 
b797eeb
ba880a7
 
 
 
 
 
 
 
b797eeb
ba880a7
 
 
 
 
 
80f6eb6
b797eeb
 
 
ba880a7
6652627
ba880a7
6652627
 
 
b797eeb
 
80f6eb6
b797eeb
6652627
 
8106243
80f6eb6
b797eeb
 
 
6652627
 
 
 
 
 
8106243
 
 
80f6eb6
8106243
b797eeb
ba880a7
6652627
 
 
 
 
 
 
8106243
 
ba880a7
b797eeb
8106243
 
b797eeb
 
 
8106243
b797eeb
 
8106243
b797eeb
 
 
6652627
 
 
 
 
 
 
 
8106243
 
 
b797eeb
8106243
b797eeb
 
 
 
 
8106243
6652627
b298046
 
6652627
b298046
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0eff4d9
b797eeb
b298046
8106243
b298046
 
 
 
 
 
 
0eff4d9
6652627
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
import os
import pandas as pd
import streamlit as st
import re
import logging
import nltk
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
import io
from langdetect import detect
from collections import Counter
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from transformers import pipeline
from groq import Groq

# Initialize logging first: a logging.error() call on an unconfigured root
# logger installs a default handler, after which basicConfig() is a no-op and
# the level/format below would be silently ignored.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Load environment variables
load_dotenv()

# Check if Groq API key is available
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    logging.error("Missing Groq API key. Please set the GROQ_API_KEY environment variable.")
    st.error("API key is missing. Please provide a valid API key.")
    # End this script run here so the message above is what the user sees;
    # without a key the Groq clients below cannot make any successful call.
    st.stop()

# Raw Groq SDK client (used for abstract generation)
groq_client = Groq(api_key=GROQ_API_KEY)

# Initialize LLM (Groq API) through LangChain (used for tone extraction)
llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")

# Download required NLTK resources
nltk.download("punkt")

# Keyword lists for the fallback tone classifier (extract_tone_fallback).
# Each key is a tone label; a text receives that label when any of the label's
# keywords occurs in the lower-cased text as a substring.
tone_categories = {
    "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
    "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
    "Somber": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief", "devastation"],
    "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
    "Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
    "Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
    "Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
    "Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
    "Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
    "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
}

# Keyword lists for frame detection. Each key is a frame label; the frame
# functions below score a text by how many of a frame's keywords occur in the
# lower-cased text as substrings. Insertion order matters: frames are ranked
# with a stable sort, so this order breaks ties between equal scores.
frame_categories = {
    "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
    "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
    "Gender & Patriarchy": ["gender", "women", "violence", "patriarchy", "equality"],
    "Religious Freedom & Persecution": ["religion", "persecution", "minorities", "intolerance", "faith"],
    "Grassroots Mobilization": ["activism", "community", "movement", "local", "mobilization"],
    "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
    "Anti-Extremism & Anti-Violence": ["extremism", "violence", "hate speech", "radicalism", "mob attack"],
    "Social Inequality & Economic Disparities": ["class privilege", "labor rights", "economic", "discrimination"],
    "Activism & Advocacy": ["justice", "rights", "demand", "protest", "march", "campaign", "freedom of speech"],
    "Systemic Oppression": ["discrimination", "oppression", "minorities", "marginalized", "exclusion"],
    "Intersectionality": ["intersecting", "women", "minorities", "struggles", "multiple oppression"],
    "Call to Action": ["join us", "sign petition", "take action", "mobilize", "support movement"],
    "Empowerment & Resistance": ["empower", "resist", "challenge", "fight for", "stand up"],
    "Climate Justice": ["environment", "climate change", "sustainability", "biodiversity", "pollution"],
    "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
}

# Detect language
def detect_language(text):
    """Return the language code reported by ``langdetect.detect`` for ``text``.

    Args:
        text: The text to classify.

    Returns:
        Whatever ``detect`` returns for the text, or ``"unknown"`` when the
        text is empty/whitespace-only or detection raises.
    """
    try:
        # Blank input (e.g. an empty caption) is an expected case: answer it
        # directly rather than provoking an exception and an error log entry.
        if not text or not text.strip():
            return "unknown"
        return detect(text)
    except Exception as e:
        logging.error(f"Error detecting language: {e}")
        return "unknown"

# Extract tone using Groq API (or fallback method)
def extract_tone(text):
    """Return a list of tone labels for ``text``.

    The labels come from the Groq-hosted chat model ``llm``. If the call
    fails, or the reply contains no usable label, the keyword-based
    ``extract_tone_fallback`` is used instead.

    Args:
        text: The text whose tone should be described.

    Returns:
        A non-empty list of tone label strings.
    """
    try:
        # `llm` is a LangChain chat model: it is driven through invoke() with
        # (role, content) pairs and answers with a message object whose text
        # lives in `.content` (there is no `.chat()` / OpenAI-style dict).
        response = llm.invoke([
            (
                "system",
                "Analyze the tone of the following text and provide descriptive tone labels. "
                "Respond with only a comma-separated list of labels.",
            ),
            ("human", text),
        ])
        # Accept commas or line breaks as separators and drop empty pieces.
        pieces = response.content.replace("\n", ",").split(",")
        labels = [piece.strip() for piece in pieces if piece.strip()]
        if labels:
            return labels
        logging.error("Groq API error: empty tone response")
    except Exception as e:
        logging.error(f"Groq API error: {e}")
    return extract_tone_fallback(text)

# Fallback method for tone extraction
def extract_tone_fallback(text, categories=None):
    """Classify tone by keyword lookup, without calling any API.

    Args:
        text: The text to classify.
        categories: Optional mapping of tone label -> list of keywords.
            Defaults to the module-level ``tone_categories``.

    Returns:
        The labels whose keyword list has at least one keyword occurring in
        the lower-cased text (substring match), in the mapping's own order,
        or ``["Neutral"]`` when nothing matches.
    """
    if categories is None:
        categories = tone_categories
    text_lower = text.lower()
    # Build the result in dictionary order; collecting into a set first made
    # the label order vary between interpreter runs.
    detected_tones = [
        category
        for category, keywords in categories.items()
        if any(word in text_lower for word in keywords)
    ]
    return detected_tones or ["Neutral"]

# Extract hashtags
def extract_hashtags(text):
    """Return every ``#`` followed by word characters found in ``text``, in order of appearance."""
    return [found.group(0) for found in re.finditer(r"#\w+", text)]

# -------------------------------------------------------------------
# Frame categorization and display helpers
# -------------------------------------------------------------------

def get_frame_category_mapping(text):
    """
    Assign every frame in frame_categories one of four focus levels for ``text``.

    A frame's score is the number of its keywords that occur in the
    lower-cased text. Frames with a positive score are ranked from highest to
    lowest (ties keep frame_categories order):
      - rank 1: "Major Focus"
      - ranks 2-3: "Significant Focus"
      - every later rank: "Minor Mention"
    Frames with a score of zero get "Not Applicable".

    The returned dict lists the ranked frames first, followed by the
    remaining frames in frame_categories order.
    """
    lowered = text.lower()

    # Score each frame by how many of its keywords appear in the text
    scores = {
        name: len([keyword for keyword in keywords if keyword in lowered])
        for name, keywords in frame_categories.items()
    }

    # Keep only frames that matched, best score first (stable for ties)
    ranked = sorted(
        (name for name, score in scores.items() if score > 0),
        key=scores.__getitem__,
        reverse=True,
    )

    mapping = {}
    for position, name in enumerate(ranked):
        if position == 0:
            mapping[name] = "Major Focus"
        elif position < 3:
            mapping[name] = "Significant Focus"
        else:
            mapping[name] = "Minor Mention"

    # Everything that did not match at all
    for name in frame_categories:
        mapping.setdefault(name, "Not Applicable")
    return mapping

def format_frame_categories_table(category_mapping):
    """
    Render a frame -> focus-level mapping as a Markdown table.

    The table has one row per frame and one column per focus level
    (Major Focus, Significant Focus, Minor Mention, Not Applicable); the
    cell under the frame's assigned level holds a tick (✓), all others are
    empty.
    """
    levels = ("Major Focus", "Significant Focus", "Minor Mention", "Not Applicable")
    lines = [
        "| Frame | Major Focus | Significant Focus | Minor Mention | Not Applicable |\n",
        "| --- | --- | --- | --- | --- |\n",
    ]
    for frame, assigned in category_mapping.items():
        marks = ["✓" if assigned == level else "" for level in levels]
        lines.append(f"| {frame} | {marks[0]} | {marks[1]} | {marks[2]} | {marks[3]} |\n")
    return "".join(lines)

# -------------------------------------------------------------------
# File processing, metadata merging, and report generation
# -------------------------------------------------------------------

def extract_captions_from_docx(docx_file):
    """Split a DOCX into posts and return each post's caption text.

    A paragraph whose stripped text starts with "Post <digits>"
    (case-insensitive) opens a new post and its full text becomes the post's
    key. Every following non-blank paragraph is added to that post until the
    next heading. Paragraphs before the first heading are ignored.

    Args:
        docx_file: Anything ``docx.Document`` accepts (the app passes the
            uploaded file object).

    Returns:
        Dict mapping heading text -> caption paragraphs joined by single
        spaces. Posts without any caption text are omitted.
    """
    doc = Document(docx_file)
    captions = {}
    current_post = None
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            # Blank paragraphs carry no caption text; keeping them added stray
            # spaces and let whitespace-only posts slip past the filter below.
            continue
        if re.match(r"Post \d+", text, re.IGNORECASE):
            current_post = text
            captions[current_post] = []
        elif current_post:
            captions[current_post].append(text)
    return {post: " ".join(lines) for post, lines in captions.items() if lines}

def extract_metadata_from_excel(excel_file):
    """Read an Excel sheet and return its rows as a list of dicts.

    Each dict maps column name -> cell value for one row. Any failure while
    reading or converting is logged and an empty list is returned instead.
    """
    try:
        return pd.read_excel(excel_file).to_dict(orient="records")
    except Exception as err:
        logging.error(f"Error processing Excel file: {err}")
    return []

def merge_metadata_with_generated_data(generated_data, excel_metadata):
    """Merge Excel metadata rows into the per-post analysis results.

    Each row is matched to the key "Post <Post Number>". A matching entry is
    updated in place with the row's fields; otherwise the row itself is stored
    as a new entry.

    Args:
        generated_data: Dict of post key -> analysis dict. Mutated in place.
        excel_metadata: Iterable of row dicts (column name -> value).

    Returns:
        The same ``generated_data`` dict, after merging.
    """
    for post_data in excel_metadata:
        number = post_data.get("Post Number")
        is_float = isinstance(number, float)
        if number is None or (is_float and number != number):
            # Missing value or NaN (NaN is the only float unequal to itself,
            # and is what a blank cell becomes): use the next free position.
            number = len(generated_data) + 1
        elif is_float and number.is_integer():
            # Numeric columns can arrive as floats; "Post 1.0" would never
            # match a "Post 1" key, so render whole numbers without ".0".
            number = int(number)
        post_number = f"Post {number}"
        if post_number in generated_data:
            generated_data[post_number].update(post_data)
        else:
            generated_data[post_number] = post_data
    return generated_data

def extract_frame_focus(text):
    """Return the focus level of every frame in frame_categories for ``text``.

    Delegates to ``get_frame_category_mapping`` so that only frames with at
    least one keyword hit are ranked ("Major Focus" / "Significant Focus" /
    "Minor Mention") and all other frames are "Not Applicable". Ranking every
    frame regardless of score would give a focus level even to frames that
    never occur in the text and leave "Not Applicable" unused.

    Args:
        text: The text to analyze.

    Returns:
        Dict mapping frame name -> focus level string.
    """
    return get_frame_category_mapping(text)

def generate_abstract(text, model="llama3-8b-8192"):
    """
    Generates an abstract and recommendations for the given document text
    using a Groq model.

    Args:
        text: The document text to summarize.
        model: Groq model id to use. Defaults to the model this file already
            configures for ``llm``.

    Returns:
        The generated text as a string, or "Abstract generation failed." when
        the request raises or the reply carries no text.
    """
    try:
        # The Groq SDK exposes an OpenAI-style chat completions endpoint:
        # a system instruction plus the document as the user message.
        response = groq_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Generate an abstract and recommendations for the following document."},
                {"role": "user", "content": text},
            ],
        )
        # Callers write the result into Streamlit and DOCX paragraphs, so
        # return the reply text itself rather than the response object.
        return response.choices[0].message.content or "Abstract generation failed."
    except Exception as e:
        logging.error(f"Groq API error: {e}")
        return "Abstract generation failed."


def _add_labeled_paragraph(doc, label, value):
    """Append an 11 pt paragraph "<label>: <value>" with the label in bold."""
    para = doc.add_paragraph()
    label_run = para.add_run(f"{label}:")
    label_run.bold = True
    label_run.font.size = Pt(11)
    value_run = para.add_run(f" {value}")
    value_run.font.size = Pt(11)


def create_docx_from_data(extracted_data):
    """Build the coding-sheet DOCX from per-post analysis data.

    For every post a level-1 heading is written, followed by one labeled
    paragraph per coding-sheet field (missing fields show "N/A") and the
    frame information: a five-column tick table when the post has a
    "FramesMapping", otherwise a single "Frames" paragraph.

    Args:
        extracted_data: Dict of post key -> dict of field name -> value.

    Returns:
        The populated ``Document`` (not yet saved).
    """
    ordered_keys = [
        "Post Number", "Date of Post", "Media Type", "Number of Pictures",
        "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
        "Full Caption", "Language", "Tone", "Hashtags"
    ]
    focus_levels = ["Major Focus", "Significant Focus", "Minor Mention", "Not Applicable"]
    tick = "✓"

    doc = Document()
    for post_number, data in extracted_data.items():
        doc.add_heading(post_number, level=1)
        for key in ordered_keys:
            value = data.get(key, "N/A")
            if key in ("Tone", "Hashtags") and isinstance(value, list):
                value = ", ".join(value)
            # Word does not interpret Markdown, so "**key:**" would show the
            # asterisks literally; use a real bold run for the label instead.
            _add_labeled_paragraph(doc, key, value)
        # Add a proper table for Frames if a mapping is available.
        if "FramesMapping" in data:
            doc.add_paragraph("Frames:")
            table = doc.add_table(rows=1, cols=5)
            table.style = "Light List Accent 1"
            header_cells = table.rows[0].cells
            header_cells[0].text = "Frame"
            for cell, level in zip(header_cells[1:], focus_levels):
                cell.text = level
            for frame, category in data["FramesMapping"].items():
                row_cells = table.add_row().cells
                row_cells[0].text = frame
                # Tick only the column matching the frame's focus level
                for cell, level in zip(row_cells[1:], focus_levels):
                    cell.text = tick if category == level else ""
        else:
            _add_labeled_paragraph(doc, "Frames", data.get("Frames", "N/A"))
        doc.add_paragraph("\n")
    return doc

# -------------------------------------------------------------------
# Streamlit App UI with Tabs
# -------------------------------------------------------------------

# Page header shown above both tabs
st.title("AI-Powered Coding Sheet Generator")
st.write("Enter text or upload a DOCX/Excel file for analysis:")

# tabs[0] hosts the Standard Analysis, tabs[1] the Detailed Analysis
tab_labels = ["Standard Analysis", "Detailed Analysis"]
tabs = st.tabs(tab_labels)

# -------------------------------------------------------------------
# Standard Analysis Tab
# -------------------------------------------------------------------
def _analyze_text(text):
    """Run all per-post analyses on ``text`` and return the coding-sheet record."""
    frame_mapping = get_frame_category_mapping(text)
    return {
        "Full Caption": text,
        "Language": detect_language(text),
        "Tone": extract_tone(text),
        "Hashtags": extract_hashtags(text),
        "Frames": format_frame_categories_table(frame_mapping),
        "FramesMapping": frame_mapping,
    }


with tabs[0]:
    input_text = st.text_area("Input Text", height=200)
    uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"], key="std_docx")
    uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"], key="std_excel")

    # Post key -> analysis record, filled from whichever inputs were provided
    output_data = {}

    if input_text:
        output_data["Manual Input"] = _analyze_text(input_text)

    if uploaded_docx:
        captions = extract_captions_from_docx(uploaded_docx)
        for caption, text in captions.items():
            output_data[caption] = _analyze_text(text)

    if uploaded_excel:
        excel_metadata = extract_metadata_from_excel(uploaded_excel)
        output_data = merge_metadata_with_generated_data(output_data, excel_metadata)

    if output_data:
        for post_number, data in output_data.items():
            with st.expander(post_number):
                for key, value in data.items():
                    if key == "FramesMapping":
                        # Internal structure used to build the DOCX table; the
                        # same information is already shown as the Frames table.
                        continue
                    if key == "Frames":
                        st.markdown(f"**{key}:**\n{value}")
                    else:
                        st.write(f"**{key}:** {value}")

        # Offer the same data as a downloadable DOCX, built in memory
        docx_output = create_docx_from_data(output_data)
        docx_io = io.BytesIO()
        docx_output.save(docx_io)
        docx_io.seek(0)
        st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="coding_sheet.docx")

# -------------------------------------------------------------------
# Detailed Analysis Tab
# -------------------------------------------------------------------
def _tally_captions(captions):
    """Count languages, tones, frame focus levels and hashtags over all captions."""
    language_counter = Counter()
    tone_counter = Counter()
    frame_counter = {frame: Counter() for frame in frame_categories}
    hashtag_counter = Counter()
    for text in captions.values():
        language_counter[detect_language(text)] += 1
        tone_counter.update(extract_tone(text))
        for frame, category in extract_frame_focus(text).items():
            frame_counter[frame][category] += 1
        hashtag_counter.update(extract_hashtags(text))
    return language_counter, tone_counter, frame_counter, hashtag_counter


def _build_summary_docx(total_posts, language_counter, tone_counter, frame_counter, hashtag_counter, abstract):
    """Write the summary statistics and abstract to an in-memory DOCX and return the buffer."""
    doc = Document()
    doc.add_heading("Analysis Summary", 0)
    doc.add_paragraph(f"Total number of posts: {total_posts}")

    doc.add_heading("Language Distribution", level=1)
    for lang, count in language_counter.items():
        doc.add_paragraph(f"{lang}: {count}")

    doc.add_heading("Tone Distribution", level=1)
    for tone, count in tone_counter.items():
        doc.add_paragraph(f"{tone}: {count}")

    doc.add_heading("Frame Distribution", level=1)
    for frame, counts in frame_counter.items():
        doc.add_paragraph(f"{frame}: {dict(counts)}")

    doc.add_heading("Hashtag Distribution", level=1)
    for hashtag, count in hashtag_counter.items():
        doc.add_paragraph(f"{hashtag}: {count}")

    doc.add_heading("Abstract & Recommendations", level=1)
    doc.add_paragraph(abstract)

    docx_io = io.BytesIO()
    doc.save(docx_io)
    docx_io.seek(0)
    return docx_io


def _build_summary_excel(language_counter, tone_counter, frame_counter, hashtag_counter, abstract):
    """Write one sheet per distribution plus the abstract to an in-memory XLSX and return the buffer."""
    excel_io = io.BytesIO()
    # Leaving the `with` block closes and saves the workbook; calling
    # writer.close() inside it as well would close the writer twice.
    with pd.ExcelWriter(excel_io, engine="xlsxwriter") as writer:
        # Language Distribution sheet
        df_language = pd.DataFrame(list(language_counter.items()), columns=["Language", "Count"])
        df_language.to_excel(writer, index=False, sheet_name="Language Distribution")

        # Tone Distribution sheet
        df_tone = pd.DataFrame(list(tone_counter.items()), columns=["Tone", "Count"])
        df_tone.to_excel(writer, index=False, sheet_name="Tone Distribution")

        # Frame Distribution sheet: one row per frame, one column per focus
        # level that occurred; absent combinations become 0.
        df_frame = pd.DataFrame.from_dict(
            {frame: dict(counter) for frame, counter in frame_counter.items()},
            orient="index",
        ).fillna(0).astype(int)
        df_frame.reset_index(inplace=True)
        df_frame.rename(columns={"index": "Frame"}, inplace=True)
        df_frame.to_excel(writer, index=False, sheet_name="Frame Distribution")

        # Hashtag Distribution sheet
        df_hashtag = pd.DataFrame(list(hashtag_counter.items()), columns=["Hashtag", "Count"])
        df_hashtag.to_excel(writer, index=False, sheet_name="Hashtag Distribution")

        # Abstract & Recommendations sheet
        df_abstract = pd.DataFrame({"Abstract & Recommendations": [abstract]})
        df_abstract.to_excel(writer, index=False, sheet_name="Abstract")
    excel_io.seek(0)
    return excel_io


with tabs[1]:
    st.title("Detailed DOCX Analysis")

    uploaded_docx = st.file_uploader("Upload DOCX file", type=["docx"])
    if uploaded_docx:
        captions = extract_captions_from_docx(uploaded_docx)
        total_posts = len(captions)
        st.write(f"**Total number of posts:** {total_posts}")

        if not captions:
            # Nothing to summarize: skip the per-post model calls, the
            # abstract request and the empty downloads.
            st.warning("No posts were found. Each post must start with a paragraph beginning with \"Post <number>\".")
        else:
            language_counter, tone_counter, frame_counter, hashtag_counter = _tally_captions(captions)

            st.subheader("Language Distribution")
            st.write(dict(language_counter))

            st.subheader("Tone Distribution")
            st.write(dict(tone_counter))

            st.subheader("Frame Distribution")
            for frame, counts in frame_counter.items():
                st.write(f"**{frame}:** {dict(counts)}")

            st.subheader("Hashtag Distribution")
            st.write(dict(hashtag_counter))

            # The abstract is generated once over all captions joined together
            combined_text = " ".join(captions.values())
            abstract = generate_abstract(combined_text)
            st.subheader("Abstract & Recommendations")
            st.write(abstract)

            docx_io = _build_summary_docx(
                total_posts, language_counter, tone_counter, frame_counter, hashtag_counter, abstract
            )
            st.download_button("Download Analysis Summary as DOCX", data=docx_io, file_name="analysis_summary.docx")

            excel_io = _build_summary_excel(
                language_counter, tone_counter, frame_counter, hashtag_counter, abstract
            )
            # Download button for the Excel file
            st.download_button(
                label="Download Analysis Data as Excel",
                data=excel_io,
                file_name="analysis_data.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )