"""Streamlit app that builds a coding sheet (language, tone, hashtags, frames) for posts.

Text can be typed in directly or loaded from a DOCX file whose posts are
introduced by "Post <n>" paragraphs; optional Excel metadata is merged in.
"""

import io
import logging
import os
import re
from collections import Counter

import nltk
import pandas as pd
import streamlit as st
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
from dotenv import load_dotenv
from groq import Groq
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langdetect import detect
from transformers import pipeline

# Initialize logging before anything is logged; a log call on an unconfigured
# root logger installs a default handler and makes a later basicConfig a no-op.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Load environment variables
load_dotenv()

# Check if Groq API key is available
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    logging.error("Missing Groq API key. Please set the GROQ_API_KEY environment variable.")
    st.error("API key is missing. Please provide a valid API key.")
    # Halt this script run here instead of building API clients without a key.
    st.stop()

groq_client = Groq(api_key=GROQ_API_KEY)

# Initialize LLM (Groq API)
llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")

# Download required NLTK resources
nltk.download("punkt")

# Tone categories for fallback method
tone_categories = {
    "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
    "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
    "Somber": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief", "devastation"],
    "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
    "Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
    "Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
    "Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
    "Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
    "Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
    "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"],
}

# Frame categories for fallback method
frame_categories = {
    "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
    "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
    "Gender & Patriarchy": ["gender", "women", "violence", "patriarchy", "equality"],
    "Religious Freedom & Persecution": ["religion", "persecution", "minorities", "intolerance", "faith"],
    "Grassroots Mobilization": ["activism", "community", "movement", "local", "mobilization"],
    "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
    "Anti-Extremism & Anti-Violence": ["extremism", "violence", "hate speech", "radicalism", "mob attack"],
    "Social Inequality & Economic Disparities": ["class privilege", "labor rights", "economic", "discrimination"],
    "Activism & Advocacy": ["justice", "rights", "demand", "protest", "march", "campaign", "freedom of speech"],
    "Systemic Oppression": ["discrimination", "oppression", "minorities", "marginalized", "exclusion"],
    "Intersectionality": ["intersecting", "women", "minorities", "struggles", "multiple oppression"],
    "Call to Action": ["join us", "sign petition", "take action", "mobilize", "support movement"],
    "Empowerment & Resistance": ["empower", "resist", "challenge", "fight for", "stand up"],
    "Climate Justice": ["environment", "climate change", "sustainability", "biodiversity", "pollution"],
    "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"],
}


# Detect language
def detect_language(text):
    """Return the language code reported by langdetect, or "unknown" on any error."""
    try:
        return detect(text)
    except Exception as e:
        logging.error(f"Error detecting language: {e}")
        return "unknown"


# Extract tone using Groq API (or fallback method)
def extract_tone(text):
    """Return a list of tone labels for *text*.

    Asks the Groq-backed chat model first. If the call fails or yields no
    usable labels, the keyword-based ``extract_tone_fallback`` is used instead.
    """
    try:
        response = llm.invoke([
            ("system", "Analyze the tone of the following text and provide descriptive tone labels."),
            ("human", text),
        ])
        # The reply is treated as a comma-separated list; drop surrounding
        # whitespace and empty fragments so the labels count cleanly.
        labels = [label.strip() for label in response.content.split(",") if label.strip()]
        if labels:
            return labels
    except Exception as e:
        logging.error(f"Groq API error: {e}")
    return extract_tone_fallback(text)


# Fallback method for tone extraction
def extract_tone_fallback(text):
    """Return every tone category with a keyword occurring in *text*, else ["Neutral"].

    Matching is a case-insensitive substring test, so a keyword also matches
    inside a longer word.
    """
    text_lower = text.lower()
    detected_tones = {
        category
        for category, keywords in tone_categories.items()
        if any(word in text_lower for word in keywords)
    }
    return list(detected_tones) if detected_tones else ["Neutral"]


# Extract hashtags
def extract_hashtags(text):
    """Return all ``#word`` tokens found in *text*, in order of appearance."""
    return re.findall(r"#\w+", text)


# -------------------------------------------------------------------
# New functions for frame categorization and display
# -------------------------------------------------------------------
def get_frame_category_mapping(text):
    """
    Returns a mapping of every frame (from frame_categories) to one of the four categories.
    Detected frames are assigned a focus level based on keyword frequency:
      - Top detected: "Major Focus"
      - Next up to two: "Significant Focus"
      - Remaining detected frames: "Minor Mention"
    Frames not detected get "Not Applicable".
    """
    text_lower = text.lower()

    # Frequency here is the number of distinct keywords of a frame that occur in the text.
    frame_freq = {
        frame: sum(1 for word in keywords if word in text_lower)
        for frame, keywords in frame_categories.items()
    }

    # Identify detected frames (frequency > 0) and sort descending.
    # The sort is stable, so ties keep the order of frame_categories.
    detected = [(frame, freq) for frame, freq in frame_freq.items() if freq > 0]
    detected.sort(key=lambda item: item[1], reverse=True)

    category_mapping = {}
    if detected:
        # Highest frequency frame as Major Focus
        category_mapping[detected[0][0]] = "Major Focus"
        # Next up to two frames as Significant Focus
        for frame, _ in detected[1:3]:
            category_mapping[frame] = "Significant Focus"
        # Remaining detected frames as Minor Mention
        for frame, _ in detected[3:]:
            category_mapping[frame] = "Minor Mention"

    # For frames not detected, assign Not Applicable
    for frame in frame_categories:
        category_mapping.setdefault(frame, "Not Applicable")
    return category_mapping


def format_frame_categories_table(category_mapping):
    """
    Returns a markdown-formatted table displaying each frame with columns:
    Major Focus, Significant Focus, Minor Mention, and Not Applicable.
    A tick (✓) marks the assigned category.
    """
    columns = ("Major Focus", "Significant Focus", "Minor Mention", "Not Applicable")
    tick = "✓"
    lines = [
        "| Frame | Major Focus | Significant Focus | Minor Mention | Not Applicable |\n",
        "| --- | --- | --- | --- | --- |\n",
    ]
    for frame, category in category_mapping.items():
        major, significant, minor, not_applicable = (
            tick if category == column else "" for column in columns
        )
        lines.append(f"| {frame} | {major} | {significant} | {minor} | {not_applicable} |\n")
    return "".join(lines)


# -------------------------------------------------------------------
# Existing functions for file processing
# -------------------------------------------------------------------
def extract_captions_from_docx(docx_file):
    """Return ``{post heading: caption text}`` parsed from a DOCX file.

    A paragraph starting with "Post <number>" (case-insensitive) opens a new
    post and its full text becomes the key. Following non-empty paragraphs are
    joined with single spaces to form the caption. Posts without any caption
    text are omitted.
    """
    doc = Document(docx_file)
    captions = {}
    current_post = None
    for para in doc.paragraphs:
        text = para.text.strip()
        if re.match(r"Post \d+", text, re.IGNORECASE):
            current_post = text
            captions[current_post] = []
        elif current_post and text:
            # Blank paragraphs are skipped so they neither pad the caption
            # nor make an empty post look like it has content.
            captions[current_post].append(text)
    return {post: " ".join(lines) for post, lines in captions.items() if lines}


def extract_metadata_from_excel(excel_file):
    """Return the rows of an Excel sheet as a list of dicts, or [] if reading fails."""
    try:
        df = pd.read_excel(excel_file)
        return df.to_dict(orient="records")
    except Exception as e:
        logging.error(f"Error processing Excel file: {e}")
        return []


def merge_metadata_with_generated_data(generated_data, excel_metadata):
    """Merge Excel rows into *generated_data* (mutated in place and returned).

    Each row is keyed as "Post <Post Number>"; rows without a "Post Number"
    value get the next free ordinal. Rows matching an existing post update it,
    others are added as new entries.
    """
    for post_data in excel_metadata:
        post_number = f"Post {post_data.get('Post Number', len(generated_data) + 1)}"
        if post_number in generated_data:
            generated_data[post_number].update(post_data)
        else:
            generated_data[post_number] = post_data
    return generated_data


def extract_frame_focus(text):
    """Return the frame -> focus-category mapping for *text*.

    Delegates to ``get_frame_category_mapping`` so both tabs classify frames
    the same way: only frames with at least one matching keyword are ranked,
    and frames with no match are "Not Applicable".
    """
    return get_frame_category_mapping(text)


def generate_abstract(text):
    """
    Generates an abstract and recommendations for the given document text using a Groq model.

    Returns the model's reply as a string, or "Abstract generation failed."
    if the request raises.
    """
    try:
        # System message instructs the model; the document is passed as a
        # template variable so braces in the text are not parsed as placeholders.
        template = ChatPromptTemplate.from_messages(
            [
                ("system", "Generate an abstract and recommendations for the following document."),
                ("human", "{document}"),
            ]
        )
        chain = template | llm | StrOutputParser()
        return chain.invoke({"document": text})
    except Exception as e:
        logging.error(f"Groq API error: {e}")
        return "Abstract generation failed."


def _add_labeled_paragraph(doc, label, value):
    """Append a paragraph "<label>: <value>" to *doc* with the label in bold."""
    para = doc.add_paragraph()
    label_run = para.add_run(f"{label}: ")
    label_run.bold = True
    value_run = para.add_run(str(value))
    for run in (label_run, value_run):
        run.font.size = Pt(11)
    return para


def create_docx_from_data(extracted_data):
    """Build a Word document with one section per post.

    Args:
        extracted_data: mapping of post heading -> dict of fields. Missing
            fields are written as "N/A". A "FramesMapping" entry, when
            present, is rendered as a table; otherwise the "Frames" value is
            written as text.

    Returns:
        The populated ``Document``.
    """
    ordered_keys = [
        "Post Number", "Date of Post", "Media Type", "Number of Pictures",
        "Number of Videos", "Number of Audios", "Likes", "Comments",
        "Tagged Audience", "Full Caption", "Language", "Tone", "Hashtags",
    ]
    frame_columns = ("Major Focus", "Significant Focus", "Minor Mention", "Not Applicable")
    tick = "✓"

    doc = Document()
    for post_number, data in extracted_data.items():
        doc.add_heading(post_number, level=1)
        for key in ordered_keys:
            value = data.get(key, "N/A")
            if key in ("Tone", "Hashtags") and isinstance(value, list):
                value = ", ".join(value)
            # Real bold formatting is used; Markdown asterisks are not rendered by Word.
            _add_labeled_paragraph(doc, key, value)

        # Add a proper table for Frames if a mapping is available.
        if "FramesMapping" in data:
            doc.add_paragraph("Frames:")
            table = doc.add_table(rows=1, cols=5)
            table.style = "Light List Accent 1"
            for cell, title in zip(table.rows[0].cells, ("Frame",) + frame_columns):
                cell.text = title
            for frame, category in data["FramesMapping"].items():
                row_cells = table.add_row().cells
                row_cells[0].text = frame
                for cell, column in zip(row_cells[1:], frame_columns):
                    cell.text = tick if category == column else ""
        else:
            _add_labeled_paragraph(doc, "Frames", data.get("Frames", "N/A"))
        doc.add_paragraph("\n")
    return doc


def _analyze_text(text):
    """Return the per-post analysis record used by the Standard Analysis tab."""
    frame_mapping = get_frame_category_mapping(text)
    return {
        "Full Caption": text,
        "Language": detect_language(text),
        "Tone": extract_tone(text),
        "Hashtags": extract_hashtags(text),
        "Frames": format_frame_categories_table(frame_mapping),
        "FramesMapping": frame_mapping,
    }


def _collect_distributions(captions):
    """Tally language, tone, frame-focus and hashtag counts over all captions.

    Returns a tuple ``(language_counter, tone_counter, frame_counter,
    hashtag_counter)`` where ``frame_counter`` maps each frame to a Counter of
    focus categories.
    """
    language_counter = Counter()
    tone_counter = Counter()
    frame_counter = {frame: Counter() for frame in frame_categories}
    hashtag_counter = Counter()
    for text in captions.values():
        language_counter[detect_language(text)] += 1
        tone_counter.update(extract_tone(text))
        for frame, category in extract_frame_focus(text).items():
            frame_counter[frame][category] += 1
        hashtag_counter.update(extract_hashtags(text))
    return language_counter, tone_counter, frame_counter, hashtag_counter


def _build_summary_docx(total_posts, language_counter, tone_counter, frame_counter,
                        hashtag_counter, abstract):
    """Return an in-memory DOCX (BytesIO, rewound) summarising the analysis."""
    doc = Document()
    doc.add_heading("Analysis Summary", 0)
    doc.add_paragraph(f"Total number of posts: {total_posts}")

    doc.add_heading("Language Distribution", level=1)
    for lang, count in language_counter.items():
        doc.add_paragraph(f"{lang}: {count}")

    doc.add_heading("Tone Distribution", level=1)
    for tone, count in tone_counter.items():
        doc.add_paragraph(f"{tone}: {count}")

    doc.add_heading("Frame Distribution", level=1)
    for frame, counts in frame_counter.items():
        doc.add_paragraph(f"{frame}: {dict(counts)}")

    doc.add_heading("Hashtag Distribution", level=1)
    for hashtag, count in hashtag_counter.items():
        doc.add_paragraph(f"{hashtag}: {count}")

    doc.add_heading("Abstract & Recommendations", level=1)
    doc.add_paragraph(abstract)

    buffer = io.BytesIO()
    doc.save(buffer)
    buffer.seek(0)
    return buffer


def _build_summary_excel(language_counter, tone_counter, frame_counter,
                         hashtag_counter, abstract):
    """Return an in-memory XLSX (BytesIO, rewound) with one sheet per distribution."""
    buffer = io.BytesIO()
    # The context manager saves and closes the workbook on exit, so no
    # explicit writer.close() is needed inside the block.
    with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
        # Language Distribution sheet
        df_language = pd.DataFrame(list(language_counter.items()), columns=["Language", "Count"])
        df_language.to_excel(writer, index=False, sheet_name="Language Distribution")

        # Tone Distribution sheet
        df_tone = pd.DataFrame(list(tone_counter.items()), columns=["Tone", "Count"])
        df_tone.to_excel(writer, index=False, sheet_name="Tone Distribution")

        # Frame Distribution sheet: one row per frame, one column per focus category.
        df_frame = (
            pd.DataFrame.from_dict(
                {frame: dict(counter) for frame, counter in frame_counter.items()},
                orient="index",
            )
            .fillna(0)
            .astype(int)
        )
        df_frame.reset_index(inplace=True)
        df_frame.rename(columns={"index": "Frame"}, inplace=True)
        df_frame.to_excel(writer, index=False, sheet_name="Frame Distribution")

        # Hashtag Distribution sheet
        df_hashtag = pd.DataFrame(list(hashtag_counter.items()), columns=["Hashtag", "Count"])
        df_hashtag.to_excel(writer, index=False, sheet_name="Hashtag Distribution")

        # Abstract & Recommendations sheet
        df_abstract = pd.DataFrame({"Abstract & Recommendations": [abstract]})
        df_abstract.to_excel(writer, index=False, sheet_name="Abstract")
    buffer.seek(0)
    return buffer


# -------------------------------------------------------------------
# Streamlit App UI with Tabs
# -------------------------------------------------------------------
st.title("AI-Powered Coding Sheet Generator")
st.write("Enter text or upload a DOCX/Excel file for analysis:")

# Create tabs for Standard Analysis and Detailed Analysis
tabs = st.tabs(["Standard Analysis", "Detailed Analysis"])

# -------------------------------------------------------------------
# Standard Analysis Tab
# -------------------------------------------------------------------
with tabs[0]:
    input_text = st.text_area("Input Text", height=200)
    uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"], key="std_docx")
    uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"], key="std_excel")

    output_data = {}

    if input_text:
        output_data["Manual Input"] = _analyze_text(input_text)

    if uploaded_docx:
        for caption, text in extract_captions_from_docx(uploaded_docx).items():
            output_data[caption] = _analyze_text(text)

    if uploaded_excel:
        excel_metadata = extract_metadata_from_excel(uploaded_excel)
        output_data = merge_metadata_with_generated_data(output_data, excel_metadata)

    if output_data:
        for post_number, data in output_data.items():
            with st.expander(post_number):
                for key, value in data.items():
                    if key == "FramesMapping":
                        # Internal structure used for the DOCX table; the
                        # "Frames" table already shows the same information.
                        continue
                    if key == "Frames":
                        # A blank line separates the label from the table so
                        # the table is rendered as its own Markdown block.
                        st.markdown(f"**{key}:**\n\n{value}")
                    else:
                        st.write(f"**{key}:** {value}")

        docx_output = create_docx_from_data(output_data)
        docx_io = io.BytesIO()
        docx_output.save(docx_io)
        docx_io.seek(0)
        st.download_button(
            "Download Merged Analysis as DOCX",
            data=docx_io,
            file_name="coding_sheet.docx",
        )

# -------------------------------------------------------------------
# Detailed Analysis Tab
# -------------------------------------------------------------------
with tabs[1]:
    st.title("Detailed DOCX Analysis")
    detailed_docx = st.file_uploader("Upload DOCX file", type=["docx"], key="detailed_docx")

    if detailed_docx:
        captions = extract_captions_from_docx(detailed_docx)
        total_posts = len(captions)
        st.write(f"**Total number of posts:** {total_posts}")

        if not captions:
            # Nothing to tally and nothing to summarise; skip the model call.
            st.warning("No posts were found in the uploaded document.")
        else:
            (language_counter, tone_counter,
             frame_counter, hashtag_counter) = _collect_distributions(captions)

            st.subheader("Language Distribution")
            st.write(dict(language_counter))

            st.subheader("Tone Distribution")
            st.write(dict(tone_counter))

            st.subheader("Frame Distribution")
            for frame, counts in frame_counter.items():
                st.write(f"**{frame}:** {dict(counts)}")

            st.subheader("Hashtag Distribution")
            st.write(dict(hashtag_counter))

            combined_text = " ".join(captions.values())
            abstract = generate_abstract(combined_text)
            st.subheader("Abstract & Recommendations")
            st.write(abstract)

            docx_io = _build_summary_docx(
                total_posts, language_counter, tone_counter,
                frame_counter, hashtag_counter, abstract,
            )
            st.download_button(
                "Download Analysis Summary as DOCX",
                data=docx_io,
                file_name="analysis_summary.docx",
            )

            # Create an in-memory Excel file
            excel_io = _build_summary_excel(
                language_counter, tone_counter, frame_counter, hashtag_counter, abstract,
            )

            # Download button for the Excel file
            st.download_button(
                label="Download Analysis Data as Excel",
                data=excel_io,
                file_name="analysis_data.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )