File size: 8,206 Bytes
0d3d327
f44d7de
706fc89
 
34d7c10
23eb166
706fc89
eb89420
e465fa1
609d4a9
34d7c10
0d3d327
34d7c10
ac7b5dd
 
bc77227
 
 
34d7c10
 
 
 
 
 
 
ac7b5dd
 
 
985e391
bc77227
 
 
c6508c5
bc77227
56ea0c4
8b7cb50
56ea0c4
ac7b5dd
bc77227
 
 
 
 
 
 
8b7cb50
bc77227
 
8b7cb50
 
bc77227
 
ac7b5dd
bc77227
ac7b5dd
8b7cb50
bc77227
2ebce6c
 
 
ac7b5dd
56ea0c4
ac7b5dd
 
56ea0c4
ac7b5dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4070da2
ac7b5dd
4070da2
ac7b5dd
 
 
 
 
 
 
 
 
 
4070da2
ac7b5dd
 
 
706fc89
 
 
34d7c10
 
706fc89
 
609d4a9
 
 
706fc89
 
 
 
 
 
 
 
609d4a9
706fc89
609d4a9
 
706fc89
da716d7
 
 
 
 
 
 
 
5893c88
da716d7
 
 
 
 
 
bba1b37
da716d7
5893c88
bba1b37
 
 
 
 
 
 
ac7b5dd
bba1b37
 
 
ac7b5dd
5856ee7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3bb165
5856ee7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import os
import pandas as pd
import streamlit as st
import re
import logging
import nltk
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
import io
from langdetect import detect
from collections import Counter
from dotenv import load_dotenv
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load environment variables from a local .env file (no-op if absent).
load_dotenv()

# Initialize logging for the whole app (module-level, applies to all functions below).
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Download required NLTK resources at import time (network call on first run;
# cached by NLTK afterwards).
nltk.download("punkt")
nltk.download("stopwords")

# Load the model and tokenizer once at module scope; shared by extract_keywords_with_bert.
# NOTE(review): this checkpoint is a *sentiment* classifier ("...-sentiment"),
# yet it is used below for keyword extraction — confirm it is the intended model.
tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = BertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

def extract_keywords_with_bert(text, top_n=15):
    """
    Extracts keywords using BERT embeddings and clustering.

    NOTE(review): the returned keywords are currently a hard-coded placeholder
    list — the model output is computed but never used for extraction. Replace
    with real clustering/ranking logic before trusting the results.

    Args:
        text: Raw caption text to analyze.
        top_n: Maximum number of keywords to return (default 15).

    Returns:
        A list of at most ``top_n`` keyword strings, or an empty list if the
        model call raises for any reason (the error is logged).
    """
    try:
        # Tokenize the text
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        
        # Get model predictions
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Extract the embeddings
        # NOTE(review): these are classification logits, not token embeddings,
        # and `embeddings` is never used below.
        embeddings = outputs.logits.squeeze().cpu().numpy()
        
        # Example: Top keywords extraction (for demo purposes, replace with clustering logic)
        keywords = ["Keyword1", "Keyword2", "Keyword3"]  # Replace this with actual extraction logic
        
        return keywords[:top_n]
    except Exception as e:
        logging.error(f"BERT keyword extraction error: {e}")
        return []

        
# -------------------------------------------------------------------
# New Functions for Theme Assignment and Frame Assignment
# -------------------------------------------------------------------
def assign_themes(keywords):
    """
    Assign one or more themes based on the extracted keywords.

    Each candidate theme carries a list of indicator terms; a theme scores one
    point for every keyword that contains any of its terms as a substring.
    All themes tied at the top score are returned; if nothing matches,
    ["General"] is returned.
    """
    theme_mapping = {
        "Social Justice": ["inequality", "activism", "rights", "justice", "protest"],
        "Environmental": ["climate", "pollution", "sustainability", "deforestation", "environment"],
        "Political": ["government", "policy", "election", "politics", "reform"],
        "Economic": ["economy", "finance", "market", "investment", "trade"],
        "Technological": ["technology", "ai", "machine learning", "innovation", "digital"]
    }
    lowered = [kw.lower() for kw in keywords]
    theme_scores = {
        theme: sum(1 for kw in lowered if any(term in kw for term in terms))
        for theme, terms in theme_mapping.items()
    }
    best = max(theme_scores.values())
    if best == 0:
        return ["General"]
    # Keep every theme that reached the top score (insertion order preserved).
    return [theme for theme, score in theme_scores.items() if score == best]

def assign_frames(themes):
    """
    Map each assigned theme to its narrative frame.

    Unknown themes fall back to "Not Applicable"; output order follows the
    input theme order.
    """
    theme_to_frame = {
        "Social Justice": "Human Rights & Justice",
        "Environmental": "Environmental Crisis & Activism",
        "Political": "Political & State Accountability",
        "Economic": "Social Inequality & Economic Disparities",
        "Technological": "Activism & Advocacy",
        "General": "Informative"
    }
    frames = []
    for theme in themes:
        frames.append(theme_to_frame.get(theme, "Not Applicable"))
    return frames

# -------------------------------------------------------------------
# Other Functions (Language Detection, Hashtags, DOCX/Excel Processing, etc.)
# -------------------------------------------------------------------
def detect_language(text):
    """Detect the language of *text* via langdetect; return "unknown" on failure."""
    try:
        language = detect(text)
    except Exception as e:
        logging.error(f"Error detecting language: {e}")
        return "unknown"
    return language

def extract_hashtags(text):
    """Return every #hashtag token (leading '#' plus word characters) in *text*."""
    hashtag_pattern = re.compile(r"#\w+")
    return hashtag_pattern.findall(text)

def extract_captions_from_docx(docx_file):
    """
    Group DOCX paragraph text under "Post N" headings.

    A paragraph matching "Post <number>" (case-insensitive) starts a new post;
    following paragraphs are collected under it. Returns a dict mapping each
    post heading to its joined caption text; posts with no body are dropped.
    """
    captions = {}
    current_post = None
    for paragraph in Document(docx_file).paragraphs:
        line = paragraph.text.strip()
        if re.match(r"Post \d+", line, re.IGNORECASE):
            current_post = line
            captions[current_post] = []
        elif current_post:
            captions[current_post].append(line)
    return {post: " ".join(parts) for post, parts in captions.items() if parts}

def extract_metadata_from_excel(excel_file):
    """
    Read an Excel sheet and return its rows as a list of dicts.

    Any failure (missing file, bad format, unreadable stream) is logged and
    yields an empty list instead of raising.
    """
    try:
        frame = pd.read_excel(excel_file)
        records = frame.to_dict(orient="records")
        return records
    except Exception as e:
        logging.error(f"Error processing Excel file: {e}")
        return []

def merge_metadata_with_generated_data(generated_data, excel_metadata):
    """
    Fold Excel-sourced metadata rows into the generated analysis, in place.

    Rows are matched by "Post <Post Number>"; a row without a 'Post Number'
    falls back to len(generated_data) + 1 at the time it is processed.
    Matching posts are updated, unmatched rows become new entries. Returns
    the (mutated) generated_data dict.
    """
    for row in excel_metadata:
        label = f"Post {row.get('Post Number', len(generated_data) + 1)}"
        if label in generated_data:
            generated_data[label].update(row)
        else:
            generated_data[label] = row
    return generated_data

def create_docx_from_data(extracted_data):
    """
    Build a DOCX report from the per-post analysis data.

    Args:
        extracted_data: Dict mapping a post heading (e.g. "Post 1") to a dict
            of field name -> value; list values are joined with ", ".

    Returns:
        A python-docx Document with one heading per post followed by one
        "<Key>: <value>" paragraph per field (missing fields shown as "N/A").
    """
    doc = Document()
    # Hoisted out of the loop: same field order for every post.
    ordered_keys = [
        "Post Number", "Date of Post", "Media Type", "Number of Pictures",
        "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
        "Full Caption", "Language", "Tone", "Hashtags", "Keywords", "Themes", "Frames"
    ]
    for post_number, data in extracted_data.items():
        doc.add_heading(post_number, level=1)
        for key in ordered_keys:
            value = data.get(key, "N/A")
            # If the value is a list, join it into a string
            if isinstance(value, list):
                value = ", ".join(value)
            para = doc.add_paragraph()
            # BUG FIX: the label was written as "**key:**" — literal asterisks
            # render verbatim in Word. Use run-level bold formatting instead.
            label = para.add_run(f"{key}: ")
            label.bold = True
            label.font.size = Pt(11)
            # str() guards against non-string metadata (e.g. numbers from Excel).
            body = para.add_run(str(value))
            body.font.size = Pt(11)
        doc.add_paragraph("\n")
    return doc

# -------------------------------------------------------------------
# Streamlit App UI
# -------------------------------------------------------------------
st.title("AI-Powered Coding Sheet Generator")
st.write("Enter text or upload a DOCX/Excel file for analysis:")

input_text = st.text_area("Input Text", height=200)
uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])

output_data = {}

def _analyze_caption(text):
    """Run the keyword -> theme -> frame pipeline on one caption and return its record."""
    keywords = extract_keywords_with_bert(text)
    themes = assign_themes(keywords)
    frames = assign_frames(themes)
    return {
        "Full Caption": text,
        "Language": detect_language(text),
        "Keywords": keywords,
        "Themes": themes,
        "Frames": frames,
        "Hashtags": extract_hashtags(text)
    }

if input_text:
    output_data["Manual Input"] = _analyze_caption(input_text)

if uploaded_docx:
    # BUG FIX: this branch previously called the undefined
    # extract_keywords_textrank(), raising NameError on any DOCX upload.
    # The BERT extractor (used for manual input) is the one that exists.
    for caption, text in extract_captions_from_docx(uploaded_docx).items():
        output_data[caption] = _analyze_caption(text)

if uploaded_excel:
    excel_metadata = extract_metadata_from_excel(uploaded_excel)
    output_data = merge_metadata_with_generated_data(output_data, excel_metadata)

if output_data:
    # On-screen preview: one expander per post.
    for post_number, data in output_data.items():
        with st.expander(post_number):
            for key, value in data.items():
                st.write(f"**{key}:** {value}")

    # Same data offered as a downloadable DOCX coding sheet.
    docx_output = create_docx_from_data(output_data)
    docx_io = io.BytesIO()
    docx_output.save(docx_io)
    docx_io.seek(0)
    st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="coding_sheet.docx")