Spaces:

ahm14
/

Summary_Generator

Sleeping

File size: 20,141 Bytes

import os
import pandas as pd
import streamlit as st
import re
import logging
import nltk
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
import io
from langdetect import detect
from collections import Counter
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from transformers import pipeline
from groq import Groq

# Initialize logging before anything can log: the first logging call on an
# unconfigured root logger installs a default handler, after which a later
# basicConfig() call is ignored and the level/format below would never apply.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Load environment variables
load_dotenv()

# Check if Groq API key is available
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    logging.error("Missing Groq API key. Please set the GROQ_API_KEY environment variable.")
    st.error("API key is missing. Please provide a valid API key.")
    # End this script run here so the page shows the message above rather than
    # continuing on to build API clients that have no key to work with.
    st.stop()

# Direct Groq SDK client
groq_client = Groq(api_key=GROQ_API_KEY)

# Initialize LLM (Groq API)
llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")

# Download required NLTK resources
nltk.download("punkt")

# Tone categories for fallback method
# Maps a tone label to the keywords/phrases that signal it. extract_tone_fallback
# lower-cases the text and tests each entry with a plain substring check, so:
#   - entries must be lower-case to ever match;
#   - multi-word entries ("great job") match as whole phrases;
#   - an entry also matches inside longer words ("rise" is found in "surprise").
tone_categories = {
    "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
    "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
    "Somber": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief", "devastation"],
    "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
    "Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
    "Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
    "Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
    "Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
    "Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
    "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
}

# Frame categories for fallback method
# Maps a frame name to its indicator keywords/phrases. The frame-scoring
# functions lower-case the text and count how many of a frame's entries occur
# in it as substrings (each entry counts at most once per frame).
# Several entries are shared between frames (e.g. "violence", "rights",
# "minorities", "pollution"), so a single word can raise the score of more
# than one frame. Dict order matters: it is the order frames are iterated in.
frame_categories = {
    "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
    "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
    "Gender & Patriarchy": ["gender", "women", "violence", "patriarchy", "equality"],
    "Religious Freedom & Persecution": ["religion", "persecution", "minorities", "intolerance", "faith"],
    "Grassroots Mobilization": ["activism", "community", "movement", "local", "mobilization"],
    "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
    "Anti-Extremism & Anti-Violence": ["extremism", "violence", "hate speech", "radicalism", "mob attack"],
    "Social Inequality & Economic Disparities": ["class privilege", "labor rights", "economic", "discrimination"],
    "Activism & Advocacy": ["justice", "rights", "demand", "protest", "march", "campaign", "freedom of speech"],
    "Systemic Oppression": ["discrimination", "oppression", "minorities", "marginalized", "exclusion"],
    "Intersectionality": ["intersecting", "women", "minorities", "struggles", "multiple oppression"],
    "Call to Action": ["join us", "sign petition", "take action", "mobilize", "support movement"],
    "Empowerment & Resistance": ["empower", "resist", "challenge", "fight for", "stand up"],
    "Climate Justice": ["environment", "climate change", "sustainability", "biodiversity", "pollution"],
    "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
}

# Detect language
def detect_language(text):
    """Return the language code that ``detect`` reports for *text*.

    Args:
        text: The caption or free text to classify.

    Returns:
        Whatever ``detect(text)`` returns, or the string ``"unknown"`` when
        *text* is not a string, is empty/whitespace-only, or detection raises.
    """
    # Blank or non-text input has nothing to classify. Answer directly rather
    # than sending it to the detector and logging an error for an expected case.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception as e:
        # Best effort: a failed detection must not abort analysis of the post.
        logging.error(f"Error detecting language: {e}")
        return "unknown"

# Extract tone using Groq API (or fallback method)
def extract_tone(text):
    """Return a list of tone labels for *text*.

    The module-level ``llm`` chat model is asked for tone labels and its reply
    is split on commas. If the call fails, or the reply contains no usable
    label, the keyword-based ``extract_tone_fallback`` is used instead.

    Args:
        text: The caption or free text to analyse.

    Returns:
        A non-empty list of label strings.
    """
    try:
        # LangChain chat models are called through ``invoke`` with a list of
        # (role, content) pairs and return a message whose text is ``.content``.
        response = llm.invoke([
            ("system", "Analyze the tone of the following text and provide descriptive tone labels."),
            ("human", text),
        ])
        # Accept "a, b", "a,b" and stray whitespace alike; drop empty pieces.
        labels = [label.strip() for label in response.content.split(",")]
        labels = [label for label in labels if label]
        if labels:
            return labels
        logging.warning("Groq API returned no tone labels; using keyword fallback.")
    except Exception as e:
        logging.error(f"Groq API error: {e}")
    return extract_tone_fallback(text)

# Fallback method for tone extraction
def extract_tone_fallback(text):
    """Return the tone categories whose keywords occur in *text*.

    Matching is a case-insensitive substring test of every keyword in
    ``tone_categories`` against the text.

    Args:
        text: The caption or free text to analyse.

    Returns:
        The matching category names in ``tone_categories`` order, or
        ``["Neutral"]`` when nothing matches.
    """
    text_lower = text.lower()
    # Collect into a list (not a set) so the result order follows the table
    # and is the same on every run; each category can appear at most once
    # because the table is iterated once.
    detected_tones = [
        category
        for category, keywords in tone_categories.items()
        if any(word in text_lower for word in keywords)
    ]
    return detected_tones or ["Neutral"]

# Extract hashtags
def extract_hashtags(text):
    """Return every ``#`` followed by one or more word characters, in order of appearance.

    Duplicates are kept; the leading ``#`` is part of each returned string.
    """
    hashtag_pattern = r"#\w+"
    return [found.group(0) for found in re.finditer(hashtag_pattern, text)]

# -------------------------------------------------------------------
# New functions for frame categorization and display
# -------------------------------------------------------------------

def get_frame_category_mapping(text):
    """
    Assign a focus level to every frame in frame_categories for the given text.

    A frame's score is the number of its keywords that occur (as lower-case
    substrings) in the text. Frames with a score above zero are ranked by
    score, highest first, with ties keeping frame_categories order:
      - rank 1: "Major Focus"
      - ranks 2-3: "Significant Focus"
      - ranks 4+: "Minor Mention"
    Frames with a score of zero get "Not Applicable".

    The returned dict lists the ranked frames first, then the remaining
    frames in frame_categories order.
    """
    lowered = text.lower()

    # Score each frame by how many of its keywords are present.
    scores = {
        frame: sum(keyword in lowered for keyword in keywords)
        for frame, keywords in frame_categories.items()
    }

    # Keep only frames that matched; sorted() is stable, so equal scores stay
    # in their original order.
    ranked = sorted(
        (frame for frame, score in scores.items() if score > 0),
        key=scores.get,
        reverse=True,
    )

    category_mapping = {}
    for position, frame in enumerate(ranked):
        if position == 0:
            level = "Major Focus"
        elif position < 3:
            level = "Significant Focus"
        else:
            level = "Minor Mention"
        category_mapping[frame] = level

    # Everything that did not match is explicitly marked as not applicable.
    for frame in frame_categories:
        category_mapping.setdefault(frame, "Not Applicable")
    return category_mapping

def format_frame_categories_table(category_mapping):
    """
    Render a frame -> category mapping as a markdown table.

    The table has one row per frame, in mapping order, and one column for
    each of Major Focus, Significant Focus, Minor Mention and Not Applicable.
    The column matching the frame's category holds a tick (✓); the others are
    empty. A category outside those four leaves the whole row unticked.
    """
    columns = ("Major Focus", "Significant Focus", "Minor Mention", "Not Applicable")
    lines = [
        "| Frame | Major Focus | Significant Focus | Minor Mention | Not Applicable |\n",
        "| --- | --- | --- | --- | --- |\n",
    ]
    for frame, category in category_mapping.items():
        marks = ["✓" if category == column else "" for column in columns]
        cells = " | ".join(marks)
        lines.append(f"| {frame} | {cells} |\n")
    return "".join(lines)

# -------------------------------------------------------------------
# Existing functions for file processing
# -------------------------------------------------------------------

def extract_captions_from_docx(docx_file):
    """Split a DOCX document into per-post captions.

    A paragraph whose stripped text starts with ``Post <digits>``
    (case-insensitive) opens a new post and becomes its key. Every following
    non-blank paragraph is added to that post until the next heading.
    Paragraphs before the first heading are ignored.

    Args:
        docx_file: Anything ``Document`` accepts (e.g. an uploaded file object).

    Returns:
        A dict mapping each heading text to its paragraphs joined with single
        spaces. Posts without any caption text are left out. If the same
        heading text occurs twice, the later occurrence replaces the earlier.
    """
    doc = Document(docx_file)
    captions = {}
    current_post = None
    for para in doc.paragraphs:
        text = para.text.strip()
        # Blank spacer paragraphs carry no caption text. Collecting them would
        # pad captions with extra spaces and make an empty post look non-empty
        # to the filter in the return statement.
        if not text:
            continue
        if re.match(r"Post \d+", text, re.IGNORECASE):
            current_post = text
            captions[current_post] = []
        elif current_post:
            captions[current_post].append(text)
    return {post: " ".join(lines) for post, lines in captions.items() if lines}

def extract_metadata_from_excel(excel_file):
    """Read a spreadsheet and return its rows as a list of dicts.

    Each dict maps column name to cell value (``orient="records"``). Any
    failure while reading or converting is logged and yields an empty list.
    """
    try:
        return pd.read_excel(excel_file).to_dict(orient="records")
    except Exception as exc:
        logging.error(f"Error processing Excel file: {exc}")
        return []

def merge_metadata_with_generated_data(generated_data, excel_metadata):
    """Merge spreadsheet rows into the per-post analysis dict.

    Each row is filed under the key ``"Post <n>"``, where ``<n>`` is the row's
    ``"Post Number"`` value. When that value is absent or NaN, ``<n>`` is the
    current number of entries in *generated_data* plus one. A whole-number
    float such as ``1.0`` is written as ``1`` so the key lines up with
    ``"Post 1"``.

    Args:
        generated_data: Dict of post key -> dict of fields; updated in place.
        excel_metadata: Iterable of row dicts.

    Returns:
        *generated_data* (the same object). Rows for an existing key are
        merged into that entry; rows for a new key are stored as-is, without
        copying.
    """
    for post_data in excel_metadata:
        fallback_number = len(generated_data) + 1
        number = post_data.get("Post Number", fallback_number)
        if isinstance(number, float):
            if number.is_integer():
                # A numeric column holding any empty cell is typically read as
                # floats, turning post 1 into 1.0; render it as "1".
                number = int(number)
            elif number != number:
                # NaN is the only float unequal to itself: an empty cell, so
                # treat the post number as missing.
                number = fallback_number
        post_number = f"Post {number}"
        if post_number in generated_data:
            generated_data[post_number].update(post_data)
        else:
            generated_data[post_number] = post_data
    return generated_data

def extract_frame_focus(text):
    """Return the focus level of every frame in frame_categories for *text*.

    Thin wrapper over get_frame_category_mapping so both analysis tabs rank
    frames by the same rule: only frames with at least one keyword hit can be
    "Major Focus", "Significant Focus" or "Minor Mention"; frames with no hit
    are "Not Applicable".

    Args:
        text: The caption or free text to analyse.

    Returns:
        A dict with one entry per frame mapping frame name to focus level.
    """
    # Ranking must exclude frames that scored zero; otherwise unmatched frames
    # are promoted to focus levels and "Not Applicable" can never be assigned.
    return get_frame_category_mapping(text)

def generate_abstract(text):
    """
    Generates an abstract and recommendations for the given document text
    using the module-level Groq chat model (``llm``).

    Args:
        text: The full document text to summarise.

    Returns:
        The model's reply as a string, or "Abstract generation failed." if
        building or running the request raises.
    """
    try:
        # System message instructs the model; the human message is a
        # placeholder filled with the document at invocation time, so braces
        # inside the document itself are not interpreted as template fields.
        template = ChatPromptTemplate.from_messages(
            [
                ("system", "Generate an abstract and recommendations for the following document."),
                ("human", "{document}")
            ]
        )
        # prompt -> chat model -> plain string. Going through ``llm`` keeps
        # one model configuration for the whole app.
        chain = template | llm | StrOutputParser()
        return chain.invoke({"document": text})
    except Exception as e:
        logging.error(f"Groq API error: {e}")
        return "Abstract generation failed."


def _add_labeled_paragraph(doc, label, value):
    """Append an 11 pt paragraph reading "<label>: <value>" with the label in bold."""
    para = doc.add_paragraph()
    label_run = para.add_run(f"{label}:")
    # Word does not interpret markdown, so emphasis is set on the run itself.
    label_run.bold = True
    label_run.font.size = Pt(11)
    value_run = para.add_run(f" {value}")
    value_run.font.size = Pt(11)
    return para


def _add_frames_table(doc, mapping):
    """Append a five-column table ticking each frame's focus category."""
    columns = ("Major Focus", "Significant Focus", "Minor Mention", "Not Applicable")
    tick = "✓"
    table = doc.add_table(rows=1, cols=5)
    table.style = "Light List Accent 1"
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = "Frame"
    for index, title in enumerate(columns, start=1):
        hdr_cells[index].text = title
    for frame, category in mapping.items():
        row_cells = table.add_row().cells
        row_cells[0].text = frame
        for index, title in enumerate(columns, start=1):
            row_cells[index].text = tick if category == title else ""
    return table


def create_docx_from_data(extracted_data):
    """Build a Word document with one section per analysed post.

    For every post the document gets a level-1 heading, one "label: value"
    paragraph per known field (missing fields show "N/A"), and the frame
    analysis: a table when the post has a "FramesMapping" entry, otherwise a
    single paragraph holding the post's "Frames" value.

    Args:
        extracted_data: Dict of post key -> dict of fields.

    Returns:
        The populated ``Document``; the caller is responsible for saving it.
    """
    # Fixed field order for the output; identical for every post.
    ordered_keys = [
        "Post Number", "Date of Post", "Media Type", "Number of Pictures",
        "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
        "Full Caption", "Language", "Tone", "Hashtags"
    ]
    doc = Document()
    for post_number, data in extracted_data.items():
        doc.add_heading(post_number, level=1)
        for key in ordered_keys:
            value = data.get(key, "N/A")
            if key in ["Tone", "Hashtags"]:
                # These fields are lists when produced by the analysis and
                # plain values when absent or supplied from elsewhere.
                value = ", ".join(value) if isinstance(value, list) else value
            _add_labeled_paragraph(doc, key, value)
        # Add a proper table for Frames if a mapping is available.
        if "FramesMapping" in data:
            doc.add_paragraph("Frames:")
            _add_frames_table(doc, data["FramesMapping"])
        else:
            _add_labeled_paragraph(doc, "Frames", data.get("Frames", "N/A"))
        # Spacer between posts.
        doc.add_paragraph("\n")
    return doc

# -------------------------------------------------------------------
# Streamlit App UI with Tabs
# -------------------------------------------------------------------

# Page header shown above both tabs.
st.title("AI-Powered Coding Sheet Generator")
st.write("Enter text or upload a DOCX/Excel file for analysis:")

# tabs[0] hosts the Standard Analysis view, tabs[1] the Detailed Analysis view.
tab_labels = ["Standard Analysis", "Detailed Analysis"]
tabs = st.tabs(tab_labels)

# -------------------------------------------------------------------
# Standard Analysis Tab
# -------------------------------------------------------------------
def _build_post_record(text):
    """Run every per-text analysis on *text* and return the resulting field dict."""
    frame_mapping = get_frame_category_mapping(text)
    return {
        "Full Caption": text,
        "Language": detect_language(text),
        "Tone": extract_tone(text),
        "Hashtags": extract_hashtags(text),
        # "Frames" is the markdown table for the page; "FramesMapping" is the
        # raw mapping that the DOCX export turns into a real table.
        "Frames": format_frame_categories_table(frame_mapping),
        "FramesMapping": frame_mapping,
    }


with tabs[0]:
    input_text = st.text_area("Input Text", height=200)
    uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"], key="std_docx")
    uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"], key="std_excel")

    # Post key -> field dict, filled from whichever inputs were provided.
    output_data = {}

    if input_text:
        output_data["Manual Input"] = _build_post_record(input_text)

    if uploaded_docx:
        for caption, text in extract_captions_from_docx(uploaded_docx).items():
            output_data[caption] = _build_post_record(text)

    if uploaded_excel:
        excel_metadata = extract_metadata_from_excel(uploaded_excel)
        output_data = merge_metadata_with_generated_data(output_data, excel_metadata)

    if output_data:
        for post_number, data in output_data.items():
            with st.expander(post_number):
                for key, value in data.items():
                    if key == "FramesMapping":
                        # Export-only helper data; the same information is
                        # already shown as the "Frames" table.
                        continue
                    if key == "Frames":
                        # Blank line so the table is parsed as its own block
                        # instead of as a continuation of the label paragraph.
                        st.markdown(f"**{key}:**\n\n{value}")
                    else:
                        st.write(f"**{key}:** {value}")

        # Offer the same data as a Word document, built in memory.
        docx_output = create_docx_from_data(output_data)
        docx_io = io.BytesIO()
        docx_output.save(docx_io)
        docx_io.seek(0)
        st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="coding_sheet.docx")

# -------------------------------------------------------------------
# Detailed Analysis Tab
# -------------------------------------------------------------------
with tabs[1]:
    st.title("Detailed DOCX Analysis")

    uploaded_docx = st.file_uploader("Upload DOCX file", type=["docx"])
    if uploaded_docx:
        captions = extract_captions_from_docx(uploaded_docx)
        total_posts = len(captions)
        st.write(f"**Total number of posts:** {total_posts}")

        # Corpus-level tallies. frame_counter holds, per frame, how many posts
        # fell into each focus category.
        language_counter = Counter()
        tone_counter = Counter()
        frame_counter = {frame: Counter() for frame in frame_categories}
        hashtag_counter = Counter()

        for text in captions.values():
            language_counter[detect_language(text)] += 1
            tone_counter.update(extract_tone(text))
            for frame, category in extract_frame_focus(text).items():
                frame_counter[frame][category] += 1
            hashtag_counter.update(extract_hashtags(text))

        # ---- On-page summary ----
        st.subheader("Language Distribution")
        st.write(dict(language_counter))

        st.subheader("Tone Distribution")
        st.write(dict(tone_counter))

        st.subheader("Frame Distribution")
        for frame, counts in frame_counter.items():
            st.write(f"**{frame}:** {dict(counts)}")

        st.subheader("Hashtag Distribution")
        st.write(dict(hashtag_counter))

        # One abstract for the whole document: all captions joined together.
        combined_text = " ".join(captions.values())
        abstract = generate_abstract(combined_text)
        st.subheader("Abstract & Recommendations")
        st.write(abstract)

        # ---- DOCX export of the same summary ----
        doc = Document()
        doc.add_heading("Analysis Summary", 0)
        doc.add_paragraph(f"Total number of posts: {total_posts}")

        doc.add_heading("Language Distribution", level=1)
        for lang, count in language_counter.items():
            doc.add_paragraph(f"{lang}: {count}")

        doc.add_heading("Tone Distribution", level=1)
        for tone, count in tone_counter.items():
            doc.add_paragraph(f"{tone}: {count}")

        doc.add_heading("Frame Distribution", level=1)
        for frame, counts in frame_counter.items():
            doc.add_paragraph(f"{frame}: {dict(counts)}")

        doc.add_heading("Hashtag Distribution", level=1)
        for hashtag, count in hashtag_counter.items():
            doc.add_paragraph(f"{hashtag}: {count}")

        doc.add_heading("Abstract & Recommendations", level=1)
        doc.add_paragraph(abstract)

        docx_io = io.BytesIO()
        doc.save(docx_io)
        docx_io.seek(0)
        st.download_button("Download Analysis Summary as DOCX", data=docx_io, file_name="analysis_summary.docx")

        # ---- Excel export: one sheet per summary section ----
        # Create an in-memory Excel file
        excel_io = io.BytesIO()

        # The context manager saves and closes the workbook on exit, so no
        # explicit close() call is made inside the block.
        with pd.ExcelWriter(excel_io, engine="xlsxwriter") as writer:
            # Language Distribution sheet
            df_language = pd.DataFrame(list(language_counter.items()), columns=["Language", "Count"])
            df_language.to_excel(writer, index=False, sheet_name="Language Distribution")

            # Tone Distribution sheet
            df_tone = pd.DataFrame(list(tone_counter.items()), columns=["Tone", "Count"])
            df_tone.to_excel(writer, index=False, sheet_name="Tone Distribution")

            # Frame Distribution sheet
            # Convert the nested dictionary (frame_counter) into a DataFrame:
            # one row per frame, one column per focus category, 0 where a
            # frame never received that category.
            df_frame = pd.DataFrame.from_dict({frame: dict(counter) for frame, counter in frame_counter.items()}, orient="index").fillna(0).astype(int)
            df_frame.reset_index(inplace=True)
            df_frame.rename(columns={"index": "Frame"}, inplace=True)
            df_frame.to_excel(writer, index=False, sheet_name="Frame Distribution")

            # Hashtag Distribution sheet
            df_hashtag = pd.DataFrame(list(hashtag_counter.items()), columns=["Hashtag", "Count"])
            df_hashtag.to_excel(writer, index=False, sheet_name="Hashtag Distribution")

            # Abstract & Recommendations sheet
            df_abstract = pd.DataFrame({"Abstract & Recommendations": [abstract]})
            df_abstract.to_excel(writer, index=False, sheet_name="Abstract")

        # Rewind so the download starts at the beginning of the workbook.
        excel_io.seek(0)

        # Download button for the Excel file
        st.download_button(
            label="Download Analysis Data as Excel",
            data=excel_io,
            file_name="analysis_data.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )