import os
import pandas as pd
import streamlit as st
import re
import logging
import nltk
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
import io
from langdetect import detect
from collections import Counter
from dotenv import load_dotenv
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# Import KeyBERT
from keybert import KeyBERT

# Load environment variables
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s - %(levelname)s - %(message)s")

# Download required NLTK resources.
# quiet=True: this module runs top-to-bottom on every Streamlit rerun, so
# keep the (idempotent) downloads from spamming the console each time.
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)

# -------------------------------------------------------------------
# KeyBERT Model Initialization for Keyword Extraction
# -------------------------------------------------------------------
# You can use a different sentence-transformers model if you wish.
kw_model = KeyBERT(model="all-MiniLM-L6-v2")


def extract_keywords_with_keybert(text, top_n=15):
    """Extract up to *top_n* keywords from *text* using KeyBERT.

    Args:
        text: Document to analyze.
        top_n: Maximum number of keywords/keyphrases to return.

    Returns:
        A list of keyword strings (relevance scores are discarded).
        Returns an empty list if extraction fails for any reason.
    """
    try:
        # extract_keywords yields (keyword, score) pairs; we keep unigrams
        # and bigrams and drop English stop words.
        keywords = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 2),
            stop_words='english',
            top_n=top_n,
        )
        # Return only the keyword texts
        return [kw[0] for kw in keywords]
    except Exception as e:
        logging.error(f"KeyBERT keyword extraction error: {e}")
        return []


# -------------------------------------------------------------------
# New Functions for Theme Assignment and Frame Assignment
# -------------------------------------------------------------------
def assign_themes(keywords):
    """Assign one or more themes based on the extracted keywords.

    Each theme has an associated list of indicator terms; a theme scores
    one point per keyword that contains one of its terms as a whole word.
    All themes tied for the top score are returned.

    Args:
        keywords: List of keyword strings (e.g. from KeyBERT).

    Returns:
        List of theme names, or ["General"] when no term matches.
    """
    theme_mapping = {
        "Social Justice": ["inequality", "activism", "rights", "justice", "protest"],
        "Environmental": ["climate", "pollution", "sustainability", "deforestation", "environment"],
        "Political": ["government", "policy", "election", "politics", "reform"],
        "Economic": ["economy", "finance", "market", "investment", "trade"],
        "Technological": ["technology", "ai", "machine learning", "innovation", "digital"],
    }
    # Score each theme based on how many of its terms appear in the keywords.
    theme_scores = {theme: 0 for theme in theme_mapping}
    for kw in keywords:
        kw_lower = kw.lower()
        for theme, terms in theme_mapping.items():
            # Whole-word match: a bare substring test made short terms like
            # "ai" fire on unrelated words ("train", "campaign", "air").
            if any(re.search(rf"\b{re.escape(term)}\b", kw_lower) for term in terms):
                theme_scores[theme] += 1
    max_score = max(theme_scores.values())
    if max_score == 0:
        return ["General"]
    # Return all themes that reached the maximum score.
    assigned = [theme for theme, score in theme_scores.items() if score == max_score]
    return assigned


def assign_frames(themes):
    """Map the assigned themes to analytical frames.

    Args:
        themes: List of theme names as produced by ``assign_themes``.

    Returns:
        List of frame names, one per theme; unknown themes map to
        "Not Applicable".
    """
    theme_to_frame = {
        "Social Justice": "Human Rights & Justice",
        "Environmental": "Environmental Crisis & Activism",
        "Political": "Political & State Accountability",
        "Economic": "Social Inequality & Economic Disparities",
        "Technological": "Activism & Advocacy",
        "General": "Informative",
    }
    frames = [theme_to_frame.get(theme, "Not Applicable") for theme in themes]
    return frames


# -------------------------------------------------------------------
# Other Functions (Language Detection, Hashtags, DOCX/Excel Processing, etc.)
# -------------------------------------------------------------------
def detect_language(text):
    """Return the detected language code for *text*, or "unknown" on failure."""
    try:
        return detect(text)
    except Exception as e:
        logging.error(f"Error detecting language: {e}")
        return "unknown"


def extract_hashtags(text):
    """Return all #hashtags found in *text*, in order of appearance."""
    return re.findall(r"#\w+", text)


def extract_captions_from_docx(docx_file):
    """Parse a DOCX of posts into a {post heading: caption text} mapping.

    Paragraphs matching "Post <n>" start a new post; subsequent non-empty
    paragraphs are collected and joined into that post's caption. Posts
    with no caption lines are dropped.
    """
    doc = Document(docx_file)
    captions = {}
    current_post = None
    for para in doc.paragraphs:
        text = para.text.strip()
        if re.match(r"Post \d+", text, re.IGNORECASE):
            current_post = text
            captions[current_post] = []
        elif current_post and text:
            # Skip empty paragraphs so the joined caption gets no stray
            # double spaces.
            captions[current_post].append(text)
    return {post: " ".join(lines) for post, lines in captions.items() if lines}


def extract_metadata_from_excel(excel_file):
    """Read an Excel sheet into a list of per-row dicts; [] on read error."""
    try:
        df = pd.read_excel(excel_file)
        return df.to_dict(orient="records")
    except Exception as e:
        logging.error(f"Error processing Excel file: {e}")
        return []


def merge_metadata_with_generated_data(generated_data, excel_metadata):
    """Merge Excel row dicts into *generated_data*, keyed by "Post <n>".

    Rows carrying a "Post Number" column update the matching post (Excel
    values win on key collisions); rows without one are inserted under the
    next free index. Mutates and returns *generated_data*.
    """
    for post_data in excel_metadata:
        post_number = f"Post {post_data.get('Post Number', len(generated_data) + 1)}"
        if post_number in generated_data:
            generated_data[post_number].update(post_data)
        else:
            generated_data[post_number] = post_data
    return generated_data


def create_docx_from_data(extracted_data):
    """Build a DOCX report: one heading per post plus "Key: value" lines.

    List values are rendered comma-separated; keys absent from a post's
    data show "N/A". Returns the python-docx Document (caller saves it).
    """
    doc = Document()
    # Fixed display order for the coding-sheet fields.
    ordered_keys = [
        "Post Number", "Date of Post", "Media Type", "Number of Pictures",
        "Number of Videos", "Number of Audios", "Likes", "Comments",
        "Tagged Audience", "Full Caption", "Language", "Tone",
        "Hashtags", "Keywords", "Themes", "Frames",
    ]
    for post_number, data in extracted_data.items():
        doc.add_heading(post_number, level=1)
        for key in ordered_keys:
            value = data.get(key, "N/A")
            # If the value is a list, join it into a string
            if isinstance(value, list):
                value = ", ".join(str(v) for v in value)
            para = doc.add_paragraph()
            # Word does not render markdown, so bold the label with run
            # formatting instead of writing literal "**...**" asterisks.
            label = para.add_run(f"{key}: ")
            label.bold = True
            label.font.size = Pt(11)
            body = para.add_run(str(value))
            body.font.size = Pt(11)
        doc.add_paragraph("\n")
    return doc


# -------------------------------------------------------------------
# Streamlit App UI
# -------------------------------------------------------------------
st.title("AI-Powered Coding Sheet Generator")
st.write("Enter text or upload a DOCX/Excel file for analysis:")

input_text = st.text_area("Input Text", height=200)
uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])

output_data = {}


def _analyze_text(text):
    """Run the full per-caption pipeline (keywords -> themes -> frames)
    and return the coding-sheet dict for one post/caption."""
    keywords = extract_keywords_with_keybert(text)
    themes = assign_themes(keywords)
    return {
        "Full Caption": text,
        "Language": detect_language(text),
        "Keywords": keywords,
        "Themes": themes,
        "Frames": assign_frames(themes),
        "Hashtags": extract_hashtags(text),
    }


# Manually typed text is treated as a single pseudo-post.
if input_text:
    output_data["Manual Input"] = _analyze_text(input_text)

# Each "Post <n>" caption found in the DOCX is analyzed independently.
if uploaded_docx:
    for caption, text in extract_captions_from_docx(uploaded_docx).items():
        output_data[caption] = _analyze_text(text)

# Excel metadata (likes, dates, ...) is merged on top of the generated data.
if uploaded_excel:
    excel_metadata = extract_metadata_from_excel(uploaded_excel)
    output_data = merge_metadata_with_generated_data(output_data, excel_metadata)

if output_data:
    # On-screen preview: one expander per post.
    for post_number, data in output_data.items():
        with st.expander(post_number):
            for key, value in data.items():
                st.write(f"**{key}:** {value}")

    # Offer the same analysis as a downloadable DOCX coding sheet.
    docx_output = create_docx_from_data(output_data)
    docx_io = io.BytesIO()
    docx_output.save(docx_io)
    docx_io.seek(0)
    st.download_button("Download Merged Analysis as DOCX",
                       data=docx_io,
                       file_name="coding_sheet.docx")