# AA_F3 / app.py
# ahm14's picture
# Update app.py
# 985e391 verified
import os
import pandas as pd
import streamlit as st
import re
import logging
import nltk
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
import io
from langdetect import detect
from collections import Counter
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load environment variables
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# --- Initialize DeepSeek-V3-0324 locally ---
MODEL_NAME = "deepseek-ai/DeepSeek-V3-0324"


@st.cache_resource
def _load_model_and_tokenizer(model_name: str):
    """Load the causal LM and its tokenizer for ``model_name``.

    Streamlit re-executes this script from top to bottom on every widget
    interaction. Decorating the loader with ``st.cache_resource`` keeps the
    loaded objects alive between reruns instead of calling
    ``from_pretrained`` again each time.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded_model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    return loaded_model, loaded_tokenizer


# Module-level names are kept because generate_response() reads them as globals.
model, tokenizer = _load_model_and_tokenizer(MODEL_NAME)
def generate_response(prompt: str, max_length: int = 150, temperature: float = 0.5) -> str:
    """Sample a completion for ``prompt`` and return only the newly generated text.

    Args:
        prompt: Text fed to the model.
        max_length: Upper bound on the number of tokens generated after the
            prompt. It is passed as ``max_new_tokens`` so that a long prompt
            cannot use up the whole budget before generation starts.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The decoded completion with special tokens skipped and surrounding
        whitespace stripped. The prompt is not part of the returned string.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        do_sample=True,
        temperature=temperature,
        top_p=0.95,
    )
    # For a causal LM the returned sequence starts with the prompt tokens;
    # drop them so callers that split the reply on commas do not also split
    # the instruction text and the quoted caption.
    prompt_token_count = inputs["input_ids"].shape[-1]
    completion_ids = outputs[0][prompt_token_count:]
    result = tokenizer.decode(completion_ids, skip_special_tokens=True)
    return result.strip()
def extract_keywords(text: str) -> list:
    """
    Ask the DeepSeek-V3-0324 model for the most important keywords in ``text``.

    The model is prompted for a comma-separated list; its reply is split on
    commas, each piece is stripped, and empty pieces are discarded.
    """
    instruction = "Extract the most important keywords from the following text. "
    payload = f"Return them as a comma-separated list.\n\nText: \"{text}\""
    reply = generate_response(instruction + payload, max_length=100, temperature=0.5)

    stripped_pieces = (piece.strip() for piece in reply.split(","))
    return [piece for piece in stripped_pieces if piece]
def suggest_themes(keywords: list) -> list:
    """
    Ask the DeepSeek-V3-0324 model for themes that fit the given keywords.

    The keywords are joined with ", " and embedded in the prompt; the reply is
    split on commas and blank entries are dropped.
    """
    joined_keywords = ", ".join(keywords)
    prompt = "".join((
        f"Based on the following keywords: {joined_keywords}, ",
        "suggest a list of relevant themes. Return them as a comma-separated list.",
    ))
    reply = generate_response(prompt, max_length=100, temperature=0.5)

    themes = []
    for chunk in reply.split(","):
        cleaned = chunk.strip()
        if cleaned:
            themes.append(cleaned)
    return themes
# --- Other helper functions ---
def detect_language(text):
    """Return the result of ``detect(text)``, or ``"unknown"`` on failure.

    Any exception raised by the detection call is logged at ERROR level and
    swallowed, so callers always get a string back.
    """
    try:
        language = detect(text)
    except Exception as exc:
        logging.error(f"Error detecting language: {exc}")
        return "unknown"
    return language
def extract_hashtags(text):
    """Return every ``#`` + word-characters token in ``text``, in order of appearance."""
    hashtag_pattern = re.compile(r"#\w+")
    return [match.group() for match in hashtag_pattern.finditer(text)]
def extract_captions_from_docx(docx_file):
    """Collect the caption text that follows each "Post N" heading in a DOCX.

    A paragraph whose stripped text starts with "Post " followed by digits
    (case-insensitive) opens a new post, keyed by that paragraph's full
    stripped text. The non-blank paragraphs that follow, up to the next
    heading, are joined with single spaces to form the caption.

    Args:
        docx_file: Passed straight to ``Document``; the app passes the
            uploaded DOCX file.

    Returns:
        A dict mapping heading text to caption text. Posts with no caption
        text are omitted.
    """
    heading_pattern = re.compile(r"Post \d+", re.IGNORECASE)
    doc = Document(docx_file)
    captions = {}
    current_post = None
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            # Blank spacer paragraphs carry no caption text. Appending them
            # would put stray spaces in the caption and make a post with
            # only blank lines look non-empty to the filter below.
            continue
        if heading_pattern.match(text):
            current_post = text
            captions[current_post] = []
        elif current_post is not None:
            captions[current_post].append(text)
    return {post: " ".join(lines) for post, lines in captions.items() if lines}
def extract_metadata_from_excel(excel_file):
    """Read ``excel_file`` with pandas and return its rows as a list of dicts.

    Each row becomes one dict (``orient="records"``). If reading or converting
    raises any exception, the error is logged and an empty list is returned.
    """
    try:
        records = pd.read_excel(excel_file).to_dict(orient="records")
    except Exception as exc:
        logging.error(f"Error processing Excel file: {exc}")
        return []
    return records
def _post_key(raw_number, fallback_number):
    """Build the "Post N" dictionary key for one spreadsheet row."""
    is_nan = isinstance(raw_number, float) and raw_number != raw_number
    if raw_number is None or is_nan:
        # Missing or blank cell: fall back to the positional number.
        raw_number = fallback_number
    elif isinstance(raw_number, float) and raw_number.is_integer():
        # A numeric column containing blanks is read as floats, so 1 arrives
        # as 1.0; format it as "Post 1" so it matches the generated keys.
        raw_number = int(raw_number)
    return f"Post {raw_number}"


def merge_metadata_with_generated_data(generated_data, excel_metadata):
    """Merge spreadsheet rows into the per-post analysis dict.

    Each row is matched to a post through its "Post Number" value, rendered
    as the key "Post N". When the value is absent, ``None`` or NaN, the key
    is built from ``len(generated_data) + 1`` instead.

    Args:
        generated_data: Dict of post key to analysis dict. Updated in place.
        excel_metadata: Iterable of row dicts.

    Returns:
        The same ``generated_data`` object. Rows matching an existing post
        are merged into it with ``dict.update``; other rows are added under
        their own key.
    """
    for post_data in excel_metadata:
        post_number = _post_key(post_data.get("Post Number"), len(generated_data) + 1)
        if post_number in generated_data:
            generated_data[post_number].update(post_data)
        else:
            generated_data[post_number] = post_data
    return generated_data
def format_frame_categories_table(category_mapping):
    """Render a frame-to-category mapping as a Markdown table.

    The table has a "Frame" column followed by one column per category. Each
    frame row carries a tick in the column equal to its category and empty
    cells elsewhere; a category outside the four columns yields no tick.
    """
    columns = ("Major Focus", "Significant Focus", "Minor Mention", "Not Applicable")
    tick = "✓"
    lines = [
        "| Frame | Major Focus | Significant Focus | Minor Mention | Not Applicable |\n",
        "| --- | --- | --- | --- | --- |\n",
    ]
    for frame, category in category_mapping.items():
        cells = " | ".join(tick if category == column else "" for column in columns)
        lines.append(f"| {frame} | {cells} |\n")
    return "".join(lines)
def get_frame_category_mapping(text):
    """
    Classify every frame as Major Focus, Significant Focus, Minor Mention or
    Not Applicable for the given text.

    A frame's score is the number of its keywords that occur as substrings of
    the lower-cased text. Frames with a non-zero score are ranked by score,
    with ties keeping their declaration order: the top frame is the Major
    Focus, the next two are Significant Focus, and the rest are Minor Mention.
    Frames with a score of zero are Not Applicable.
    """
    frame_keywords = {
        "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
        "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
        "Gender & Patriarchy": ["gender", "women", "violence", "patriarchy", "equality"],
        "Religious Freedom & Persecution": ["religion", "persecution", "minorities", "intolerance", "faith"],
        "Grassroots Mobilization": ["activism", "community", "movement", "local", "mobilization"],
        "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
        "Anti-Extremism & Anti-Violence": ["extremism", "violence", "hate speech", "radicalism", "mob attack"],
        "Social Inequality & Economic Disparities": ["class privilege", "labor rights", "economic", "discrimination"],
        "Activism & Advocacy": ["justice", "rights", "demand", "protest", "march", "campaign", "freedom of speech"],
        "Systemic Oppression": ["discrimination", "oppression", "minorities", "marginalized", "exclusion"],
        "Intersectionality": ["intersecting", "women", "minorities", "struggles", "multiple oppression"],
        "Call to Action": ["join us", "sign petition", "take action", "mobilize", "support movement"],
        "Empowerment & Resistance": ["empower", "resist", "challenge", "fight for", "stand up"],
        "Climate Justice": ["environment", "climate change", "sustainability", "biodiversity", "pollution"],
        "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"],
    }
    haystack = text.lower()

    # Score each frame by how many of its keywords appear in the text.
    scored = []
    for name, terms in frame_keywords.items():
        hits = len([term for term in terms if term in haystack])
        if hits > 0:
            scored.append((name, hits))

    # sorted() is stable, so equal scores keep their declaration order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    category_mapping = {}
    for position, (name, _hits) in enumerate(ranked):
        if position == 0:
            label = "Major Focus"
        elif position < 3:
            label = "Significant Focus"
        else:
            label = "Minor Mention"
        category_mapping[name] = label

    # Every frame without a hit is reported explicitly.
    for name in frame_keywords:
        category_mapping.setdefault(name, "Not Applicable")
    return category_mapping
def _add_labeled_paragraph(doc, label, value, size=None):
    """Append a paragraph with a bold "label:" run followed by the value."""
    para = doc.add_paragraph()
    label_run = para.add_run(f"{label}:")
    # Real bold formatting; Markdown asterisks would show up literally in Word.
    label_run.bold = True
    value_run = para.add_run(f" {value}")
    if size is not None:
        label_run.font.size = size
        value_run.font.size = size
    return para


def _add_frames_table(doc, mapping):
    """Append the five-column frame table, ticking each frame's category."""
    columns = ("Major Focus", "Significant Focus", "Minor Mention", "Not Applicable")
    table = doc.add_table(rows=1, cols=5)
    table.style = "Light List Accent 1"
    header_cells = table.rows[0].cells
    header_cells[0].text = "Frame"
    for index, title in enumerate(columns, start=1):
        header_cells[index].text = title
    tick = "✓"
    for frame, category in mapping.items():
        row_cells = table.add_row().cells
        row_cells[0].text = frame
        for index, title in enumerate(columns, start=1):
            row_cells[index].text = tick if category == title else ""


def _coerce_keywords(value):
    """Return the keywords as a list of strings, whatever shape they arrived in."""
    if isinstance(value, str):
        # Joining a plain string would interleave ", " between its characters,
        # so a comma-separated string is split into entries instead.
        return [part.strip() for part in value.split(",") if part.strip()]
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    return []


def _add_summary_table(doc, keywords, themes, frames_mapping):
    """Append the one-row Keywords / Themes / Frames summary table."""
    doc.add_paragraph("Summary Table:")
    summary_table = doc.add_table(rows=1, cols=3)
    summary_table.style = "Light List Accent 1"
    header_cells = summary_table.rows[0].cells
    header_cells[0].text = "Keywords"
    header_cells[1].text = "Themes"
    header_cells[2].text = "Frames"
    row_cells = summary_table.add_row().cells
    row_cells[0].text = ", ".join(keywords) if keywords else "N/A"
    row_cells[1].text = ", ".join(themes) if themes else "N/A"
    frames_list = ", ".join(f"{frame} ({cat})" for frame, cat in frames_mapping.items())
    row_cells[2].text = frames_list if frames_list else "N/A"


def create_docx_from_data(extracted_data):
    """Build a Word document with one section per analysed post.

    For every post the document gets a level-1 heading, one "label: value"
    paragraph per metadata field (missing fields show "N/A"), a frame table
    when "FramesMapping" is present (otherwise a single "Frames" paragraph),
    and a summary table of keywords, suggested themes and frames.

    Args:
        extracted_data: Dict mapping post key to that post's data dict.

    Returns:
        The populated ``Document``. Themes are obtained by calling
        ``suggest_themes`` once per post that has keywords.
    """
    ordered_keys = (
        "Post Number", "Date of Post", "Media Type", "Number of Pictures",
        "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
        "Full Caption", "Language", "Tone", "Hashtags", "Keywords",
    )
    list_valued_keys = ("Tone", "Hashtags", "Keywords")

    doc = Document()
    for post_number, data in extracted_data.items():
        doc.add_heading(post_number, level=1)

        for key in ordered_keys:
            value = data.get(key, "N/A")
            if key in list_valued_keys and isinstance(value, list):
                value = ", ".join(value)
            _add_labeled_paragraph(doc, key, value, size=Pt(11))

        if "FramesMapping" in data:
            doc.add_paragraph("Frames:")
            _add_frames_table(doc, data["FramesMapping"])
        else:
            _add_labeled_paragraph(doc, "Frames", data.get("Frames", "N/A"))

        # --- Summary Table for Keywords, Themes, and Frames ---
        keywords = _coerce_keywords(data.get("Keywords", []))
        # Generate themes using the DeepSeek-based function
        themes = suggest_themes(keywords) if keywords else []
        _add_summary_table(doc, keywords, themes, data.get("FramesMapping", {}))

        doc.add_paragraph("\n")
    return doc
# --- Streamlit App UI ---
def _analyze_caption(caption_text):
    """Build the analysis record shown and exported for one caption."""
    mapping = get_frame_category_mapping(caption_text)
    table_markdown = format_frame_categories_table(mapping)
    # Use the DeepSeek-based keyword extraction
    keyword_list = extract_keywords(caption_text)
    # For demonstration, tone reuses extract_keywords as well
    # (consider creating a dedicated tone function)
    tone_list = extract_keywords(caption_text)
    return {
        "Full Caption": caption_text,
        "Language": detect_language(caption_text),
        "Tone": tone_list,
        "Hashtags": extract_hashtags(caption_text),
        "Frames": table_markdown,
        "FramesMapping": mapping,
        "Keywords": keyword_list,
    }


st.title("AI-Powered Coding Sheet Generator")
st.write("Enter text or upload a DOCX/Excel file for analysis:")

input_text = st.text_area("Input Text", height=200)
uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])

output_data = {}

# Free-text input is analysed as a single pseudo-post.
if input_text:
    output_data["Manual Input"] = _analyze_caption(input_text)

# Each "Post N" section of the uploaded document becomes its own entry.
if uploaded_docx:
    for caption, text in extract_captions_from_docx(uploaded_docx).items():
        output_data[caption] = _analyze_caption(text)

# Spreadsheet rows are merged into (or added alongside) the generated entries.
if uploaded_excel:
    output_data = merge_metadata_with_generated_data(
        output_data, extract_metadata_from_excel(uploaded_excel)
    )

if output_data:
    for post_number, data in output_data.items():
        with st.expander(post_number):
            for key, value in data.items():
                if key == "Frames":
                    # The frames value is a Markdown table, so render it as Markdown.
                    st.markdown(f"**{key}:**\n{value}")
                else:
                    st.write(f"**{key}:** {value}")

    # Serialise the merged analysis into an in-memory DOCX for download.
    docx_io = io.BytesIO()
    create_docx_from_data(output_data).save(docx_io)
    docx_io.seek(0)
    st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="coding_sheet.docx")