File size: 8,206 Bytes
0d3d327
f44d7de
706fc89
 
34d7c10
23eb166
706fc89
eb89420
e465fa1
609d4a9
34d7c10
0d3d327
34d7c10
ac7b5dd
 
bc77227
 
 
34d7c10
 
 
 
 
 
 
ac7b5dd
 
 
985e391
bc77227
 
 
c6508c5
bc77227
56ea0c4
8b7cb50
56ea0c4
ac7b5dd
bc77227
 
 
 
 
 
 
8b7cb50
bc77227
 
8b7cb50
 
bc77227
 
ac7b5dd
bc77227
ac7b5dd
8b7cb50
bc77227
2ebce6c
 
 
ac7b5dd
56ea0c4
ac7b5dd
 
56ea0c4
ac7b5dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4070da2
ac7b5dd
4070da2
ac7b5dd
 
 
 
 
 
 
 
 
 
4070da2
ac7b5dd
 
 
706fc89
 
 
34d7c10
 
706fc89
 
609d4a9
 
 
706fc89
 
 
 
 
 
 
 
609d4a9
706fc89
609d4a9
 
706fc89
da716d7
 
 
 
 
 
 
 
5893c88
da716d7
 
 
 
 
 
bba1b37
da716d7
5893c88
bba1b37
 
 
 
 
 
 
ac7b5dd
bba1b37
 
 
ac7b5dd
5856ee7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3bb165
5856ee7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import os
import pandas as pd
import streamlit as st
import re
import logging
import nltk
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
import io
from langdetect import detect
from collections import Counter
from dotenv import load_dotenv
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load environment variables from a local .env file (no-op if absent).
load_dotenv()

# Initialize logging for the whole app (module-level, applies to all functions below).
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Download required NLTK resources at import time (network call on first run;
# cached by NLTK afterwards).
nltk.download("punkt")
nltk.download("stopwords")

# Load the model and tokenizer once at module scope; shared by extract_keywords_with_bert.
# NOTE(review): this checkpoint is a *sentiment* classifier ("...-sentiment"),
# yet it is used below for keyword extraction — confirm it is the intended model.
tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = BertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

def extract_keywords_with_bert(text, top_n=15):
    """
    Extracts keywords using BERT embeddings and clustering.

    NOTE(review): the returned keywords are currently a hard-coded placeholder
    list — the model output is computed but never used for extraction. Replace
    with real clustering/ranking logic before trusting the results.

    Args:
        text: Raw caption text to analyze.
        top_n: Maximum number of keywords to return (default 15).

    Returns:
        A list of at most ``top_n`` keyword strings, or an empty list if the
        model call raises for any reason (the error is logged).
    """
    try:
        # Tokenize the text
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        
        # Get model predictions
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Extract the embeddings
        # NOTE(review): these are classification logits, not token embeddings,
        # and `embeddings` is never used below.
        embeddings = outputs.logits.squeeze().cpu().numpy()
        
        # Example: Top keywords extraction (for demo purposes, replace with clustering logic)
        keywords = ["Keyword1", "Keyword2", "Keyword3"]  # Replace this with actual extraction logic
        
        return keywords[:top_n]
    except Exception as e:
        logging.error(f"BERT keyword extraction error: {e}")
        return []

        
# -------------------------------------------------------------------
# New Functions for Theme Assignment and Frame Assignment
# -------------------------------------------------------------------
def assign_themes(keywords):
    """
    Assign one or more themes based on the extracted keywords.

    Each candidate theme carries a list of indicator terms; a theme scores one
    point for every keyword that contains any of its terms as a substring.
    All themes tied at the top score are returned; if nothing matches,
    ["General"] is returned.
    """
    theme_mapping = {
        "Social Justice": ["inequality", "activism", "rights", "justice", "protest"],
        "Environmental": ["climate", "pollution", "sustainability", "deforestation", "environment"],
        "Political": ["government", "policy", "election", "politics", "reform"],
        "Economic": ["economy", "finance", "market", "investment", "trade"],
        "Technological": ["technology", "ai", "machine learning", "innovation", "digital"]
    }
    lowered = [kw.lower() for kw in keywords]
    theme_scores = {
        theme: sum(1 for kw in lowered if any(term in kw for term in terms))
        for theme, terms in theme_mapping.items()
    }
    best = max(theme_scores.values())
    if best == 0:
        return ["General"]
    # Keep every theme that reached the top score (insertion order preserved).
    return [theme for theme, score in theme_scores.items() if score == best]

def assign_frames(themes):
    """
    Map each assigned theme to its narrative frame.

    Unknown themes fall back to "Not Applicable"; output order follows the
    input theme order.
    """
    theme_to_frame = {
        "Social Justice": "Human Rights & Justice",
        "Environmental": "Environmental Crisis & Activism",
        "Political": "Political & State Accountability",
        "Economic": "Social Inequality & Economic Disparities",
        "Technological": "Activism & Advocacy",
        "General": "Informative"
    }
    frames = []
    for theme in themes:
        frames.append(theme_to_frame.get(theme, "Not Applicable"))
    return frames

# -------------------------------------------------------------------
# Other Functions (Language Detection, Hashtags, DOCX/Excel Processing, etc.)
# -------------------------------------------------------------------
def detect_language(text):
    """Detect the language of *text* via langdetect; return "unknown" on failure."""
    try:
        language = detect(text)
    except Exception as e:
        logging.error(f"Error detecting language: {e}")
        return "unknown"
    return language

def extract_hashtags(text):
    """Return every #hashtag token (leading '#' plus word characters) in *text*."""
    hashtag_pattern = re.compile(r"#\w+")
    return hashtag_pattern.findall(text)

def extract_captions_from_docx(docx_file):
    """
    Group DOCX paragraph text under "Post N" headings.

    A paragraph matching "Post <number>" (case-insensitive) starts a new post;
    following paragraphs are collected under it. Returns a dict mapping each
    post heading to its joined caption text; posts with no body are dropped.
    """
    captions = {}
    current_post = None
    for paragraph in Document(docx_file).paragraphs:
        line = paragraph.text.strip()
        if re.match(r"Post \d+", line, re.IGNORECASE):
            current_post = line
            captions[current_post] = []
        elif current_post:
            captions[current_post].append(line)
    return {post: " ".join(parts) for post, parts in captions.items() if parts}

def extract_metadata_from_excel(excel_file):
    """
    Read an Excel sheet and return its rows as a list of dicts.

    Any failure (missing file, bad format, unreadable stream) is logged and
    yields an empty list instead of raising.
    """
    try:
        frame = pd.read_excel(excel_file)
        records = frame.to_dict(orient="records")
        return records
    except Exception as e:
        logging.error(f"Error processing Excel file: {e}")
        return []

def merge_metadata_with_generated_data(generated_data, excel_metadata):
    """
    Fold Excel-sourced metadata rows into the generated analysis, in place.

    Rows are matched by "Post <Post Number>"; a row without a 'Post Number'
    falls back to len(generated_data) + 1 at the time it is processed.
    Matching posts are updated, unmatched rows become new entries. Returns
    the (mutated) generated_data dict.
    """
    for row in excel_metadata:
        label = f"Post {row.get('Post Number', len(generated_data) + 1)}"
        if label in generated_data:
            generated_data[label].update(row)
        else:
            generated_data[label] = row
    return generated_data

def create_docx_from_data(extracted_data):
    """
    Build a DOCX report from the per-post analysis data.

    Args:
        extracted_data: Dict mapping a post heading (e.g. "Post 1") to a dict
            of field name -> value; list values are joined with ", ".

    Returns:
        A python-docx Document with one heading per post followed by one
        "<Key>: <value>" paragraph per field (missing fields shown as "N/A").
    """
    doc = Document()
    # Hoisted out of the loop: same field order for every post.
    ordered_keys = [
        "Post Number", "Date of Post", "Media Type", "Number of Pictures",
        "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
        "Full Caption", "Language", "Tone", "Hashtags", "Keywords", "Themes", "Frames"
    ]
    for post_number, data in extracted_data.items():
        doc.add_heading(post_number, level=1)
        for key in ordered_keys:
            value = data.get(key, "N/A")
            # If the value is a list, join it into a string
            if isinstance(value, list):
                value = ", ".join(value)
            para = doc.add_paragraph()
            # BUG FIX: the label was written as "**key:**" — literal asterisks
            # render verbatim in Word. Use run-level bold formatting instead.
            label = para.add_run(f"{key}: ")
            label.bold = True
            label.font.size = Pt(11)
            # str() guards against non-string metadata (e.g. numbers from Excel).
            body = para.add_run(str(value))
            body.font.size = Pt(11)
        doc.add_paragraph("\n")
    return doc

# -------------------------------------------------------------------
# Streamlit App UI
# -------------------------------------------------------------------
st.title("AI-Powered Coding Sheet Generator")
st.write("Enter text or upload a DOCX/Excel file for analysis:")

input_text = st.text_area("Input Text", height=200)
uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])

output_data = {}

def _analyze_caption(text):
    """Run the keyword -> theme -> frame pipeline on one caption and return its record."""
    keywords = extract_keywords_with_bert(text)
    themes = assign_themes(keywords)
    frames = assign_frames(themes)
    return {
        "Full Caption": text,
        "Language": detect_language(text),
        "Keywords": keywords,
        "Themes": themes,
        "Frames": frames,
        "Hashtags": extract_hashtags(text)
    }

if input_text:
    output_data["Manual Input"] = _analyze_caption(input_text)

if uploaded_docx:
    # BUG FIX: this branch previously called the undefined
    # extract_keywords_textrank(), raising NameError on any DOCX upload.
    # The BERT extractor (used for manual input) is the one that exists.
    for caption, text in extract_captions_from_docx(uploaded_docx).items():
        output_data[caption] = _analyze_caption(text)

if uploaded_excel:
    excel_metadata = extract_metadata_from_excel(uploaded_excel)
    output_data = merge_metadata_with_generated_data(output_data, excel_metadata)

if output_data:
    # On-screen preview: one expander per post.
    for post_number, data in output_data.items():
        with st.expander(post_number):
            for key, value in data.items():
                st.write(f"**{key}:** {value}")

    # Same data offered as a downloadable DOCX coding sheet.
    docx_output = create_docx_from_data(output_data)
    docx_io = io.BytesIO()
    docx_output.save(docx_io)
    docx_io.seek(0)
    st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="coding_sheet.docx")