import os
import pandas as pd
import streamlit as st
import re
import logging
import nltk
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
import io
from langdetect import detect
from collections import Counter
from dotenv import load_dotenv
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# Import KeyBERT
from keybert import KeyBERT

# Load environment variables
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s - %(levelname)s - %(message)s")

# Download required NLTK resources.
# quiet=True: this module runs top-to-bottom on every Streamlit rerun, so
# keep the (idempotent) downloads from spamming the console each time.
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)

# -------------------------------------------------------------------
# KeyBERT Model Initialization for Keyword Extraction
# -------------------------------------------------------------------
# You can use a different sentence-transformers model if you wish.
kw_model = KeyBERT(model="all-MiniLM-L6-v2")


def extract_keywords_with_keybert(text, top_n=15):
    """Extract up to *top_n* keywords from *text* using KeyBERT.

    Args:
        text: Document to analyze.
        top_n: Maximum number of keywords/keyphrases to return.

    Returns:
        A list of keyword strings (relevance scores are discarded).
        Returns an empty list if extraction fails for any reason.
    """
    try:
        # extract_keywords yields (keyword, score) pairs; we keep unigrams
        # and bigrams and drop English stop words.
        keywords = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 2),
            stop_words='english',
            top_n=top_n,
        )
        # Return only the keyword texts
        return [kw[0] for kw in keywords]
    except Exception as e:
        logging.error(f"KeyBERT keyword extraction error: {e}")
        return []


# -------------------------------------------------------------------
# New Functions for Theme Assignment and Frame Assignment
# -------------------------------------------------------------------
def assign_themes(keywords):
    """Assign one or more themes based on the extracted keywords.

    Each theme has an associated list of indicator terms; a theme scores
    one point per keyword that contains one of its terms as a whole word.
    All themes tied for the top score are returned.

    Args:
        keywords: List of keyword strings (e.g. from KeyBERT).

    Returns:
        List of theme names, or ["General"] when no term matches.
    """
    theme_mapping = {
        "Social Justice": ["inequality", "activism", "rights", "justice", "protest"],
        "Environmental": ["climate", "pollution", "sustainability", "deforestation", "environment"],
        "Political": ["government", "policy", "election", "politics", "reform"],
        "Economic": ["economy", "finance", "market", "investment", "trade"],
        "Technological": ["technology", "ai", "machine learning", "innovation", "digital"],
    }
    # Score each theme based on how many of its terms appear in the keywords.
    theme_scores = {theme: 0 for theme in theme_mapping}
    for kw in keywords:
        kw_lower = kw.lower()
        for theme, terms in theme_mapping.items():
            # Whole-word match: a bare substring test made short terms like
            # "ai" fire on unrelated words ("train", "campaign", "air").
            if any(re.search(rf"\b{re.escape(term)}\b", kw_lower) for term in terms):
                theme_scores[theme] += 1
    max_score = max(theme_scores.values())
    if max_score == 0:
        return ["General"]
    # Return all themes that reached the maximum score.
    assigned = [theme for theme, score in theme_scores.items() if score == max_score]
    return assigned


def assign_frames(themes):
    """Map the assigned themes to analytical frames.

    Args:
        themes: List of theme names as produced by ``assign_themes``.

    Returns:
        List of frame names, one per theme; unknown themes map to
        "Not Applicable".
    """
    theme_to_frame = {
        "Social Justice": "Human Rights & Justice",
        "Environmental": "Environmental Crisis & Activism",
        "Political": "Political & State Accountability",
        "Economic": "Social Inequality & Economic Disparities",
        "Technological": "Activism & Advocacy",
        "General": "Informative",
    }
    frames = [theme_to_frame.get(theme, "Not Applicable") for theme in themes]
    return frames


# -------------------------------------------------------------------
# Other Functions (Language Detection, Hashtags, DOCX/Excel Processing, etc.)
# -------------------------------------------------------------------
def detect_language(text):
    """Return the detected language code for *text*, or "unknown" on failure."""
    try:
        return detect(text)
    except Exception as e:
        logging.error(f"Error detecting language: {e}")
        return "unknown"


def extract_hashtags(text):
    """Return all #hashtags found in *text*, in order of appearance."""
    return re.findall(r"#\w+", text)


def extract_captions_from_docx(docx_file):
    """Parse a DOCX of posts into a {post heading: caption text} mapping.

    Paragraphs matching "Post <n>" start a new post; subsequent non-empty
    paragraphs are collected and joined into that post's caption. Posts
    with no caption lines are dropped.
    """
    doc = Document(docx_file)
    captions = {}
    current_post = None
    for para in doc.paragraphs:
        text = para.text.strip()
        if re.match(r"Post \d+", text, re.IGNORECASE):
            current_post = text
            captions[current_post] = []
        elif current_post and text:
            # Skip empty paragraphs so the joined caption gets no stray
            # double spaces.
            captions[current_post].append(text)
    return {post: " ".join(lines) for post, lines in captions.items() if lines}


def extract_metadata_from_excel(excel_file):
    """Read an Excel sheet into a list of per-row dicts; [] on read error."""
    try:
        df = pd.read_excel(excel_file)
        return df.to_dict(orient="records")
    except Exception as e:
        logging.error(f"Error processing Excel file: {e}")
        return []


def merge_metadata_with_generated_data(generated_data, excel_metadata):
    """Merge Excel row dicts into *generated_data*, keyed by "Post <n>".

    Rows carrying a "Post Number" column update the matching post (Excel
    values win on key collisions); rows without one are inserted under the
    next free index. Mutates and returns *generated_data*.
    """
    for post_data in excel_metadata:
        post_number = f"Post {post_data.get('Post Number', len(generated_data) + 1)}"
        if post_number in generated_data:
            generated_data[post_number].update(post_data)
        else:
            generated_data[post_number] = post_data
    return generated_data


def create_docx_from_data(extracted_data):
    """Build a DOCX report: one heading per post plus "Key: value" lines.

    List values are rendered comma-separated; keys absent from a post's
    data show "N/A". Returns the python-docx Document (caller saves it).
    """
    doc = Document()
    # Fixed display order for the coding-sheet fields.
    ordered_keys = [
        "Post Number", "Date of Post", "Media Type", "Number of Pictures",
        "Number of Videos", "Number of Audios", "Likes", "Comments",
        "Tagged Audience", "Full Caption", "Language", "Tone",
        "Hashtags", "Keywords", "Themes", "Frames",
    ]
    for post_number, data in extracted_data.items():
        doc.add_heading(post_number, level=1)
        for key in ordered_keys:
            value = data.get(key, "N/A")
            # If the value is a list, join it into a string
            if isinstance(value, list):
                value = ", ".join(str(v) for v in value)
            para = doc.add_paragraph()
            # Word does not render markdown, so bold the label with run
            # formatting instead of writing literal "**...**" asterisks.
            label = para.add_run(f"{key}: ")
            label.bold = True
            label.font.size = Pt(11)
            body = para.add_run(str(value))
            body.font.size = Pt(11)
        doc.add_paragraph("\n")
    return doc


# -------------------------------------------------------------------
# Streamlit App UI
# -------------------------------------------------------------------
st.title("AI-Powered Coding Sheet Generator")
st.write("Enter text or upload a DOCX/Excel file for analysis:")

input_text = st.text_area("Input Text", height=200)
uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])

output_data = {}


def _analyze_text(text):
    """Run the full per-caption pipeline (keywords -> themes -> frames)
    and return the coding-sheet dict for one post/caption."""
    keywords = extract_keywords_with_keybert(text)
    themes = assign_themes(keywords)
    return {
        "Full Caption": text,
        "Language": detect_language(text),
        "Keywords": keywords,
        "Themes": themes,
        "Frames": assign_frames(themes),
        "Hashtags": extract_hashtags(text),
    }


# Manually typed text is treated as a single pseudo-post.
if input_text:
    output_data["Manual Input"] = _analyze_text(input_text)

# Each "Post <n>" caption found in the DOCX is analyzed independently.
if uploaded_docx:
    for caption, text in extract_captions_from_docx(uploaded_docx).items():
        output_data[caption] = _analyze_text(text)

# Excel metadata (likes, dates, ...) is merged on top of the generated data.
if uploaded_excel:
    excel_metadata = extract_metadata_from_excel(uploaded_excel)
    output_data = merge_metadata_with_generated_data(output_data, excel_metadata)

if output_data:
    # On-screen preview: one expander per post.
    for post_number, data in output_data.items():
        with st.expander(post_number):
            for key, value in data.items():
                st.write(f"**{key}:** {value}")

    # Offer the same analysis as a downloadable DOCX coding sheet.
    docx_output = create_docx_from_data(output_data)
    docx_io = io.BytesIO()
    docx_output.save(docx_io)
    docx_io.seek(0)
    st.download_button("Download Merged Analysis as DOCX",
                       data=docx_io,
                       file_name="coding_sheet.docx")