import os import pandas as pd import streamlit as st import re import logging import nltk from docx import Document from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.shared import Pt import io from langdetect import detect from collections import Counter from dotenv import load_dotenv from transformers import AutoModelForCausalLM, AutoTokenizer # Load environment variables load_dotenv() # Initialize logging logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") # --- Initialize DeepSeek-V3-0324 locally --- MODEL_NAME = "deepseek-ai/DeepSeek-V3-0324" model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) def generate_response(prompt: str, max_length: int = 150, temperature: float = 0.5) -> str: input_ids = tokenizer.encode(prompt, return_tensors="pt") outputs = model.generate( input_ids, max_length=max_length, do_sample=True, temperature=temperature, top_p=0.95 ) result = tokenizer.decode(outputs[0], skip_special_tokens=True) return result.strip() def extract_keywords(text: str) -> list: """ Use DeepSeek-V3-0324 to extract keywords from the input text. The prompt asks for a comma-separated list. """ prompt = (f"Extract the most important keywords from the following text. " f"Return them as a comma-separated list.\n\nText: \"{text}\"") response = generate_response(prompt, max_length=100, temperature=0.5) keywords = [kw.strip() for kw in response.split(",") if kw.strip()] return keywords def suggest_themes(keywords: list) -> list: """ Use DeepSeek-V3-0324 to suggest relevant themes based on the extracted keywords. """ keywords_str = ", ".join(keywords) prompt = (f"Based on the following keywords: {keywords_str}, " f"suggest a list of relevant themes. Return them as a comma-separated list.") response = generate_response(prompt, max_length=100, temperature=0.5) themes = [theme.strip() for theme in response.split(",") if theme.strip()] return themes # --- Retain or slightly adjust other helper functions --- def detect_language(text): try: return detect(text) except Exception as e: logging.error(f"Error detecting language: {e}") return "unknown" def extract_hashtags(text): return re.findall(r"#\w+", text) def extract_captions_from_docx(docx_file): doc = Document(docx_file) captions = {} current_post = None for para in doc.paragraphs: text = para.text.strip() if re.match(r"Post \d+", text, re.IGNORECASE): current_post = text captions[current_post] = [] elif current_post: captions[current_post].append(text) return {post: " ".join(lines) for post, lines in captions.items() if lines} def extract_metadata_from_excel(excel_file): try: df = pd.read_excel(excel_file) extracted_data = df.to_dict(orient="records") return extracted_data except Exception as e: logging.error(f"Error processing Excel file: {e}") return [] def merge_metadata_with_generated_data(generated_data, excel_metadata): for post_data in excel_metadata: post_number = f"Post {post_data.get('Post Number', len(generated_data) + 1)}" if post_number in generated_data: generated_data[post_number].update(post_data) else: generated_data[post_number] = post_data return generated_data def format_frame_categories_table(category_mapping): header = "| Frame | Major Focus | Significant Focus | Minor Mention | Not Applicable |\n" header += "| --- | --- | --- | --- | --- |\n" tick = "✓" rows = "" for frame, category in category_mapping.items(): major = tick if category == "Major Focus" else "" significant = tick if category == "Significant Focus" else "" minor = tick if category == "Minor Mention" else "" not_applicable = tick if category == "Not Applicable" else "" rows += f"| {frame} | {major} | {significant} | {minor} | {not_applicable} |\n" return header + rows def get_frame_category_mapping(text): """ Returns a mapping for frames based on the frequency of certain keywords. """ text_lower = text.lower() frame_categories = { "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"], "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"], "Gender & Patriarchy": ["gender", "women", "violence", "patriarchy", "equality"], "Religious Freedom & Persecution": ["religion", "persecution", "minorities", "intolerance", "faith"], "Grassroots Mobilization": ["activism", "community", "movement", "local", "mobilization"], "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"], "Anti-Extremism & Anti-Violence": ["extremism", "violence", "hate speech", "radicalism", "mob attack"], "Social Inequality & Economic Disparities": ["class privilege", "labor rights", "economic", "discrimination"], "Activism & Advocacy": ["justice", "rights", "demand", "protest", "march", "campaign", "freedom of speech"], "Systemic Oppression": ["discrimination", "oppression", "minorities", "marginalized", "exclusion"], "Intersectionality": ["intersecting", "women", "minorities", "struggles", "multiple oppression"], "Call to Action": ["join us", "sign petition", "take action", "mobilize", "support movement"], "Empowerment & Resistance": ["empower", "resist", "challenge", "fight for", "stand up"], "Climate Justice": ["environment", "climate change", "sustainability", "biodiversity", "pollution"], "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"] } frame_freq = {} for frame, keywords in frame_categories.items(): freq = sum(1 for word in keywords if word in text_lower) frame_freq[frame] = freq detected = [(frame, freq) for frame, freq in frame_freq.items() if freq > 0] detected.sort(key=lambda x: x[1], reverse=True) category_mapping = {} if detected: category_mapping[detected[0][0]] = "Major Focus" for frame, _ in detected[1:3]: category_mapping[frame] = "Significant Focus" for frame, _ in detected[3:]: category_mapping[frame] = "Minor Mention" for frame in frame_categories.keys(): if frame not in category_mapping: category_mapping[frame] = "Not Applicable" return category_mapping def create_docx_from_data(extracted_data): doc = Document() for post_number, data in extracted_data.items(): doc.add_heading(post_number, level=1) ordered_keys = [ "Post Number", "Date of Post", "Media Type", "Number of Pictures", "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience", "Full Caption", "Language", "Tone", "Hashtags", "Keywords" ] for key in ordered_keys: value = data.get(key, "N/A") if key in ["Tone", "Hashtags", "Keywords"]: value = ", ".join(value) if isinstance(value, list) else value para = doc.add_paragraph() run = para.add_run(f"**{key}:** {value}") run.font.size = Pt(11) if "FramesMapping" in data: doc.add_paragraph("Frames:") mapping = data["FramesMapping"] table = doc.add_table(rows=1, cols=5) table.style = "Light List Accent 1" hdr_cells = table.rows[0].cells hdr_cells[0].text = "Frame" hdr_cells[1].text = "Major Focus" hdr_cells[2].text = "Significant Focus" hdr_cells[3].text = "Minor Mention" hdr_cells[4].text = "Not Applicable" tick = "✓" for frame, category in mapping.items(): row_cells = table.add_row().cells row_cells[0].text = frame row_cells[1].text = tick if category == "Major Focus" else "" row_cells[2].text = tick if category == "Significant Focus" else "" row_cells[3].text = tick if category == "Minor Mention" else "" row_cells[4].text = tick if category == "Not Applicable" else "" else: value = data.get("Frames", "N/A") doc.add_paragraph(f"**Frames:** {value}") # --- New: Summary Table for Keywords, Themes, and Frames --- keywords = data.get("Keywords", []) # Generate themes using DeepSeek-based function themes = suggest_themes(keywords) if keywords else [] doc.add_paragraph("Summary Table:") summary_table = doc.add_table(rows=1, cols=3) summary_table.style = "Light List Accent 1" hdr_cells = summary_table.rows[0].cells hdr_cells[0].text = "Keywords" hdr_cells[1].text = "Themes" hdr_cells[2].text = "Frames" row_cells = summary_table.add_row().cells row_cells[0].text = ", ".join(keywords) if keywords else "N/A" row_cells[1].text = ", ".join(themes) if themes else "N/A" frames_from_mapping = data.get("FramesMapping", {}) frames_list = ", ".join([f"{frame} ({cat})" for frame, cat in frames_from_mapping.items()]) row_cells[2].text = frames_list if frames_list else "N/A" doc.add_paragraph("\n") return doc # --- Streamlit App UI --- st.title("AI-Powered Coding Sheet Generator") st.write("Enter text or upload a DOCX/Excel file for analysis:") input_text = st.text_area("Input Text", height=200) uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"]) uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"]) output_data = {} if input_text: frame_mapping = get_frame_category_mapping(input_text) frames_table = format_frame_categories_table(frame_mapping) # Use the DeepSeek-based keyword extraction keywords = extract_keywords(input_text) # For demonstration, reusing the extract_keywords for Tone as well (consider creating a dedicated tone function) tone = extract_keywords(input_text) output_data["Manual Input"] = { "Full Caption": input_text, "Language": detect_language(input_text), "Tone": tone, "Hashtags": extract_hashtags(input_text), "Frames": frames_table, "FramesMapping": frame_mapping, "Keywords": keywords } if uploaded_docx: captions = extract_captions_from_docx(uploaded_docx) for caption, text in captions.items(): frame_mapping = get_frame_category_mapping(text) frames_table = format_frame_categories_table(frame_mapping) keywords = extract_keywords(text) tone = extract_keywords(text) output_data[caption] = { "Full Caption": text, "Language": detect_language(text), "Tone": tone, "Hashtags": extract_hashtags(text), "Frames": frames_table, "FramesMapping": frame_mapping, "Keywords": keywords } if uploaded_excel: excel_metadata = extract_metadata_from_excel(uploaded_excel) output_data = merge_metadata_with_generated_data(output_data, excel_metadata) if output_data: for post_number, data in output_data.items(): with st.expander(post_number): for key, value in data.items(): if key == "Frames": st.markdown(f"**{key}:**\n{value}") else: st.write(f"**{key}:** {value}") if output_data: docx_output = create_docx_from_data(output_data) docx_io = io.BytesIO() docx_output.save(docx_io) docx_io.seek(0) st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="coding_sheet.docx")