# AA_F4 / app.py — AI-powered coding-sheet generator (Streamlit app)
# Origin: Hugging Face Space by ahm14, commit 5b55d6b ("Update app.py")
import os
import pandas as pd
import streamlit as st
import re
import logging
import nltk
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
import io
from langdetect import detect
from collections import Counter
from dotenv import load_dotenv
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
# Import KeyBERT
from keybert import KeyBERT
# Load environment variables from a local .env file (API keys, config overrides).
load_dotenv()
# Initialize logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Download required NLTK resources at import time (no-op if already cached).
# "punkt" backs sent_tokenize; "stopwords" backs nltk.corpus.stopwords.
nltk.download("punkt")
nltk.download("stopwords")
# -------------------------------------------------------------------
# KeyBERT Model Initialization for Keyword Extraction
# -------------------------------------------------------------------
# Module-level singleton so the sentence-transformers model loads once.
# You can use a different sentence-transformers model if you wish.
kw_model = KeyBERT(model="all-MiniLM-L6-v2")
def extract_keywords_with_keybert(text, top_n=15):
    """
    Extract up to *top_n* keywords/keyphrases (1-2 words) from *text* using
    the module-level KeyBERT model.

    Returns a list of keyword strings (relevance scores are discarded), or
    an empty list if extraction fails for any reason.
    """
    try:
        scored_keywords = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 2),
            stop_words='english',
            top_n=top_n,
        )
        # Drop the relevance scores; callers only need the phrases.
        return [phrase for phrase, _score in scored_keywords]
    except Exception as e:
        # Best-effort: log and degrade gracefully rather than crash the app.
        logging.error(f"KeyBERT keyword extraction error: {e}")
        return []
# -------------------------------------------------------------------
# New Functions for Theme Assignment and Frame Assignment
# -------------------------------------------------------------------
def assign_themes(keywords):
    """
    Assign one or more themes based on the extracted keywords.

    Each theme is defined by a list of indicator terms; a theme scores one
    point for every keyword that contains one of its terms as a whole word
    or phrase. All themes tied at the highest non-zero score are returned;
    if nothing matches, ["General"] is returned.

    Args:
        keywords: iterable of keyword strings (e.g. from KeyBERT).

    Returns:
        list[str]: the highest-scoring theme name(s), or ["General"].
    """
    theme_mapping = {
        "Social Justice": ["inequality", "activism", "rights", "justice", "protest"],
        "Environmental": ["climate", "pollution", "sustainability", "deforestation", "environment"],
        "Political": ["government", "policy", "election", "politics", "reform"],
        "Economic": ["economy", "finance", "market", "investment", "trade"],
        "Technological": ["technology", "ai", "machine learning", "innovation", "digital"]
    }
    theme_scores = {theme: 0 for theme in theme_mapping}
    for kw in keywords:
        kw_lower = kw.lower()
        for theme, terms in theme_mapping.items():
            # Match whole words/phrases only. A bare substring test gave
            # false positives: e.g. "ai" matched inside "rain" or "email".
            if any(re.search(rf"\b{re.escape(term)}\b", kw_lower) for term in terms):
                theme_scores[theme] += 1
    max_score = max(theme_scores.values())
    if max_score == 0:
        return ["General"]
    # Return all themes that reached the maximum score.
    return [theme for theme, score in theme_scores.items() if score == max_score]
def assign_frames(themes):
    """
    Translate each assigned theme into its corresponding analytical frame.

    Themes without a known mapping yield "Not Applicable".

    Args:
        themes: list of theme names (output of assign_themes).

    Returns:
        list[str]: one frame label per input theme, order preserved.
    """
    theme_to_frame = {
        "Social Justice": "Human Rights & Justice",
        "Environmental": "Environmental Crisis & Activism",
        "Political": "Political & State Accountability",
        "Economic": "Social Inequality & Economic Disparities",
        "Technological": "Activism & Advocacy",
        "General": "Informative"
    }
    # One frame per theme; unknown themes fall through to "Not Applicable".
    return [theme_to_frame.get(theme_name, "Not Applicable") for theme_name in themes]
# -------------------------------------------------------------------
# Other Functions (Language Detection, Hashtags, DOCX/Excel Processing, etc.)
# -------------------------------------------------------------------
def detect_language(text):
    """
    Best-effort language detection via langdetect.

    Returns the detected ISO 639-1 code (e.g. "en"), or "unknown" when
    detection fails (empty/ambiguous text raises inside langdetect).
    """
    try:
        language_code = detect(text)
    except Exception as e:
        logging.error(f"Error detecting language: {e}")
        return "unknown"
    return language_code
def extract_hashtags(text):
    """Return every hashtag token (e.g. "#topic") found in *text*, in order."""
    hashtag_pattern = r"#\w+"
    return re.findall(hashtag_pattern, text)
def extract_captions_from_docx(docx_file):
    """
    Parse a DOCX file into {post header: caption text}.

    Paragraphs matching "Post N" (case-insensitive) start a new post; all
    following non-empty paragraphs are collected as that post's caption
    and joined with single spaces. Posts with no caption text are dropped.

    Args:
        docx_file: path or file-like object accepted by docx.Document.

    Returns:
        dict[str, str]: post header -> joined caption text.
    """
    doc = Document(docx_file)
    captions = {}
    current_post = None
    for para in doc.paragraphs:
        text = para.text.strip()
        if re.match(r"Post \d+", text, re.IGNORECASE):
            current_post = text
            captions[current_post] = []
        elif current_post and text:
            # Skip blank paragraphs: appending "" produced doubled spaces
            # in the join and made empty posts look non-empty.
            captions[current_post].append(text)
    return {post: " ".join(lines) for post, lines in captions.items() if lines}
def extract_metadata_from_excel(excel_file):
    """
    Read an Excel sheet into a list of per-row metadata dicts.

    Args:
        excel_file: path or file-like object accepted by pandas.read_excel.

    Returns:
        list[dict]: one dict per row (column name -> value); an empty list
        if the file cannot be read.
    """
    try:
        frame = pd.read_excel(excel_file)
        # One dict per spreadsheet row, keyed by column header.
        return frame.to_dict(orient="records")
    except Exception as e:
        logging.error(f"Error processing Excel file: {e}")
        return []
def merge_metadata_with_generated_data(generated_data, excel_metadata):
    """
    Fold Excel metadata rows into the generated analysis dict, in place.

    Rows carrying a 'Post Number' column are merged into the matching
    "Post N" entry (metadata values win on key collisions); rows without
    one fall back to a key derived from the dict's current size.

    Args:
        generated_data: dict of "Post N" -> analysis fields (mutated).
        excel_metadata: list of row dicts from extract_metadata_from_excel.

    Returns:
        dict: the same generated_data object, after merging.
    """
    for row in excel_metadata:
        post_key = f"Post {row.get('Post Number', len(generated_data) + 1)}"
        if post_key in generated_data:
            generated_data[post_key].update(row)
        else:
            generated_data[post_key] = row
    return generated_data
def create_docx_from_data(extracted_data):
    """
    Build a DOCX report: one heading per post, one labelled paragraph per field.

    Fields are emitted in a fixed order; missing fields show "N/A" and list
    values are comma-joined. Labels are bolded via run formatting.

    Args:
        extracted_data: dict of post header -> {field name: value}.

    Returns:
        docx.Document: the assembled document (caller saves it).
    """
    # Fixed field order for every post (loop-invariant, so built once).
    ordered_keys = [
        "Post Number", "Date of Post", "Media Type", "Number of Pictures",
        "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
        "Full Caption", "Language", "Tone", "Hashtags", "Keywords", "Themes", "Frames"
    ]
    doc = Document()
    for post_number, data in extracted_data.items():
        doc.add_heading(post_number, level=1)
        for key in ordered_keys:
            value = data.get(key, "N/A")
            # If the value is a list, join it into a string
            if isinstance(value, list):
                value = ", ".join(value)
            para = doc.add_paragraph()
            # Bold the label via run formatting: literal "**...**" markers
            # are Markdown and do not render as bold inside a DOCX file.
            label_run = para.add_run(f"{key}: ")
            label_run.bold = True
            label_run.font.size = Pt(11)
            value_run = para.add_run(str(value))
            value_run.font.size = Pt(11)
        doc.add_paragraph("\n")
    return doc
# -------------------------------------------------------------------
# Streamlit App UI
# -------------------------------------------------------------------
# -------------------------------------------------------------------
# Streamlit App UI
# -------------------------------------------------------------------
st.title("AI-Powered Coding Sheet Generator")
st.write("Enter text or upload a DOCX/Excel file for analysis:")

input_text = st.text_area("Input Text", height=200)
uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])


def _analyze_caption(text):
    """Run the per-caption pipeline (keywords -> themes -> frames) on *text*.

    Shared by the manual-input and DOCX branches, which previously duplicated
    this logic line for line.
    """
    keywords = extract_keywords_with_keybert(text)
    themes = assign_themes(keywords)
    frames = assign_frames(themes)
    return {
        "Full Caption": text,
        "Language": detect_language(text),
        "Keywords": keywords,
        "Themes": themes,
        "Frames": frames,
        "Hashtags": extract_hashtags(text)
    }


output_data = {}
if input_text:
    output_data["Manual Input"] = _analyze_caption(input_text)
if uploaded_docx:
    for caption, text in extract_captions_from_docx(uploaded_docx).items():
        output_data[caption] = _analyze_caption(text)
if uploaded_excel:
    # Excel rows enrich (or add to) the generated entries by post number.
    excel_metadata = extract_metadata_from_excel(uploaded_excel)
    output_data = merge_metadata_with_generated_data(output_data, excel_metadata)

if output_data:
    # On-screen preview: one collapsible panel per post.
    for post_number, data in output_data.items():
        with st.expander(post_number):
            for key, value in data.items():
                st.write(f"**{key}:** {value}")
    # Offer the same data as a downloadable DOCX report.
    docx_output = create_docx_from_data(output_data)
    docx_io = io.BytesIO()
    docx_output.save(docx_io)
    docx_io.seek(0)
    st.download_button(
        "Download Merged Analysis as DOCX",
        data=docx_io,
        file_name="coding_sheet.docx",
        # Explicit MIME type so browsers treat the download as a Word file.
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )