AA_F3

Sleeping

App Files Files Community

ahm14 committed on Mar 28, 2025

Commit

985e391

verified ·

1 Parent(s): 56ea0c4

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -223

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import os
 import pandas as pd
 import streamlit as st
@@ -12,135 +11,55 @@ import io
 from langdetect import detect
 from collections import Counter
 from dotenv import load_dotenv
-from langchain_groq import ChatGroq
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.prompts import ChatPromptTemplate
-from transformers import pipeline
-from nltk.tokenize import sent_tokenize
-from rake_nltk import Rake
 # Load environment variables
 load_dotenv()
-# Check if Groq API key is available
-GROQ_API_KEY = os.getenv("GROQ_API_KEY")
-if not GROQ_API_KEY:
-    logging.error("Missing Groq API key. Please set the GROQ_API_KEY environment variable.")
-    st.error("API key is missing. Please provide a valid API key.")
 # Initialize logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-# Initialize LLM (Groq API)
-llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")
-# Download required NLTK resources
-nltk.download("punkt")
-nltk.download("punkt_tab")
-nltk.download("stopwords")
-# Tone categories for fallback method
-tone_categories = {
-    "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
-    "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
-    "Somber": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief", "devastation"],
-    "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
-    "Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
-    "Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
-    "Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
-    "Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
-    "Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
-    "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
-}
-# Frame categories for fallback method
-frame_categories = {
-    "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
-    "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
-    "Gender & Patriarchy": ["gender", "women", "violence", "patriarchy", "equality"],
-    "Religious Freedom & Persecution": ["religion", "persecution", "minorities", "intolerance", "faith"],
-    "Grassroots Mobilization": ["activism", "community", "movement", "local", "mobilization"],
-    "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
-    "Anti-Extremism & Anti-Violence": ["extremism", "violence", "hate speech", "radicalism", "mob attack"],
-    "Social Inequality & Economic Disparities": ["class privilege", "labor rights", "economic", "discrimination"],
-    "Activism & Advocacy": ["justice", "rights", "demand", "protest", "march", "campaign", "freedom of speech"],
-    "Systemic Oppression": ["discrimination", "oppression", "minorities", "marginalized", "exclusion"],
-    "Intersectionality": ["intersecting", "women", "minorities", "struggles", "multiple oppression"],
-    "Call to Action": ["join us", "sign petition", "take action", "mobilize", "support movement"],
-    "Empowerment & Resistance": ["empower", "resist", "challenge", "fight for", "stand up"],
-    "Climate Justice": ["environment", "climate change", "sustainability", "biodiversity", "pollution"],
-    "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
-}
-def suggest_themes(keywords):
     """
-    Suggest themes based on extracted keywords using a simple mapping.
-    You can adjust the mapping dictionary as needed.
     """
-    theme_mapping = {
-        "violence": "Conflict",
-        "crisis": "Conflict",
-        "repression": "Oppression",
-        "oppression": "Oppression",
-        "freedom": "Empowerment",
-        "hope": "Optimism",
-        "unity": "Solidarity",
-        "progress": "Advancement",
-        "justice": "Social Justice",
-        "rights": "Social Justice",
-        "equality": "Equality",
-        "exploitation": "Exploitation",
-        "mobilize": "Mobilization",
-        "protest": "Activism",
-        "environment": "Environmental",
-        "climate": "Environmental"
-    }
-    suggested = set()
-    for kw in keywords:
-        lower_kw = kw.lower()
-        for key, theme in theme_mapping.items():
-            if key in lower_kw:
-                suggested.add(theme)
-    return list(suggested)
-def suggest_frames(themes):
     """
-    Suggest frames based on the suggested themes.
-    Adjust this mapping to reflect the relationship between themes and your framing categories.
     """
-    frame_mapping = {
-        "Conflict": "Anti-Extremism & Anti-Violence",
-        "Oppression": "Systemic Oppression",
-        "Empowerment": "Empowerment & Resistance",
-        "Optimism": "Hopeful",
-        "Solidarity": "Positive",
-        "Advancement": "Informative",
-        "Social Justice": "Human Rights & Justice",
-        "Equality": "Gender & Patriarchy",
-        "Exploitation": "Political & State Accountability",
-        "Mobilization": "Grassroots Mobilization",
-        "Activism": "Activism & Advocacy",
-        "Environmental": "Environmental Crisis & Activism"
-    }
-    suggested_frames = set()
-    for theme in themes:
-        for key, frame in frame_mapping.items():
-            if key.lower() in theme.lower():
-                suggested_frames.add(frame)
-    return list(suggested_frames)
-def extract_keywords(text):
-    # Initialize RAKE with default NLTK stopwords
-    r = Rake()
-    # Extract keywords from the text
-    r.extract_keywords_from_text(text)
-    # Get ranked phrases (highest ranking first)
-    ranked_phrases = r.get_ranked_phrases()
-    # Return only the top N keywords
-    return ranked_phrases
-# Detect language
 def detect_language(text):
     try:
         return detect(text)
@@ -148,91 +67,9 @@ def detect_language(text):
         logging.error(f"Error detecting language: {e}")
         return "unknown"
-# Extract tone using Groq API (or fallback method)
-def extract_tone(text):
-    try:
-        response = llm.chat([{"role": "system", "content": "Analyze the tone of the following text and provide descriptive tone labels."},
-                             {"role": "user", "content": text}])
-        return response["choices"][0]["message"]["content"].split(", ")
-    except Exception as e:
-        logging.error(f"Groq API error: {e}")
-        return extract_tone_fallback(text)
-# Fallback method for tone extraction
-def extract_tone_fallback(text):
-    detected_tones = set()
-    text_lower = text.lower()
-    for category, keywords in tone_categories.items():
-        if any(word in text_lower for word in keywords):
-            detected_tones.add(category)
-    return list(detected_tones) if detected_tones else ["Neutral"]
-# Extract hashtags
 def extract_hashtags(text):
     return re.findall(r"#\w+", text)
-# -------------------------------------------------------------------
-# New functions for frame categorization and display
-# -------------------------------------------------------------------
-def get_frame_category_mapping(text):
-    """
-    Returns a mapping of every frame (from frame_categories) to one of the four categories.
-    Detected frames are assigned a focus level based on keyword frequency:
-      - Top detected: "Major Focus"
-      - Next up to two: "Significant Focus"
-      - Remaining detected: "Minor Mention"
-    Frames not detected get "Not Applicable".
-    """
-    text_lower = text.lower()
-    # Calculate frequency for each frame
-    frame_freq = {}
-    for frame, keywords in frame_categories.items():
-        freq = sum(1 for word in keywords if word in text_lower)
-        frame_freq[frame] = freq
-    # Identify detected frames (frequency > 0) and sort descending
-    detected = [(frame, freq) for frame, freq in frame_freq.items() if freq > 0]
-    detected.sort(key=lambda x: x[1], reverse=True)
-    category_mapping = {}
-    if detected:
-        # Highest frequency frame as Major Focus
-        category_mapping[detected[0][0]] = "Major Focus"
-        # Next up to two frames as Significant Focus
-        for frame, _ in detected[1:3]:
-            category_mapping[frame] = "Significant Focus"
-        # Remaining detected frames as Minor Mention
-        for frame, _ in detected[3:]:
-            category_mapping[frame] = "Minor Mention"
-    # For frames not detected, assign Not Applicable
-    for frame in frame_categories.keys():
-        if frame not in category_mapping:
-            category_mapping[frame] = "Not Applicable"
-    return category_mapping
-def format_frame_categories_table(category_mapping):
-    """
-    Returns a markdown-formatted table displaying each frame with columns:
-    Major Focus, Significant Focus, Minor Mention, and Not Applicable.
-    A tick (✓) marks the assigned category.
-    """
-    header = "| Frame | Major Focus | Significant Focus | Minor Mention | Not Applicable |\n"
-    header += "| --- | --- | --- | --- | --- |\n"
-    tick = "✓"
-    rows = ""
-    for frame, category in category_mapping.items():
-        major = tick if category == "Major Focus" else ""
-        significant = tick if category == "Significant Focus" else ""
-        minor = tick if category == "Minor Mention" else ""
-        not_applicable = tick if category == "Not Applicable" else ""
-        rows += f"| {frame} | {major} | {significant} | {minor} | {not_applicable} |\n"
-    return header + rows
-# -------------------------------------------------------------------
-# Existing functions for file processing
-# -------------------------------------------------------------------
 def extract_captions_from_docx(docx_file):
     doc = Document(docx_file)
     captions = {}
@@ -264,6 +101,59 @@ def merge_metadata_with_generated_data(generated_data, excel_metadata):
             generated_data[post_number] = post_data
     return generated_data
 def create_docx_from_data(extracted_data):
     doc = Document()
     for post_number, data in extracted_data.items():
@@ -271,18 +161,15 @@ def create_docx_from_data(extracted_data):
         ordered_keys = [
             "Post Number", "Date of Post", "Media Type", "Number of Pictures",
             "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
-            "Full Caption", "Language", "Tone", "Hashtags", "Keywords"  # Added "Keywords"
         ]
         for key in ordered_keys:
             value = data.get(key, "N/A")
             if key in ["Tone", "Hashtags", "Keywords"]:
-                # For keywords, join the list to a comma-separated string
                 value = ", ".join(value) if isinstance(value, list) else value
             para = doc.add_paragraph()
             run = para.add_run(f"**{key}:** {value}")
             run.font.size = Pt(11)
-        # Existing code to add the Frames table (if present)
         if "FramesMapping" in data:
             doc.add_paragraph("Frames:")
             mapping = data["FramesMapping"]
@@ -305,15 +192,10 @@ def create_docx_from_data(extracted_data):
         else:
             value = data.get("Frames", "N/A")
             doc.add_paragraph(f"**Frames:** {value}")
-        # --- New: Table for Keywords, Themes, and Frames ---
-        # Assume that 'Keywords' is already extracted and stored in data.
         keywords = data.get("Keywords", [])
-        # Generate suggested themes and frames from keywords
         themes = suggest_themes(keywords) if keywords else []
-        frames_from_themes = suggest_frames(themes) if themes else []
-        # Create a new table with 3 columns: Keywords, Themes, Frames
         doc.add_paragraph("Summary Table:")
         summary_table = doc.add_table(rows=1, cols=3)
         summary_table.style = "Light List Accent 1"
@@ -321,21 +203,16 @@ def create_docx_from_data(extracted_data):
         hdr_cells[0].text = "Keywords"
         hdr_cells[1].text = "Themes"
         hdr_cells[2].text = "Frames"
         row_cells = summary_table.add_row().cells
         row_cells[0].text = ", ".join(keywords) if keywords else "N/A"
         row_cells[1].text = ", ".join(themes) if themes else "N/A"
-        row_cells[2].text = ", ".join(frames_from_themes) if frames_from_themes else "N/A"
         doc.add_paragraph("\n")
     return doc
-# -------------------------------------------------------------------
-# Streamlit App UI
-# -------------------------------------------------------------------
 st.title("AI-Powered Coding Sheet Generator")
 st.write("Enter text or upload a DOCX/Excel file for analysis:")
@@ -348,14 +225,18 @@ output_data = {}
 if input_text:
     frame_mapping = get_frame_category_mapping(input_text)
     frames_table = format_frame_categories_table(frame_mapping)
     output_data["Manual Input"] = {
         "Full Caption": input_text,
         "Language": detect_language(input_text),
-        "Tone": extract_tone(input_text),
         "Hashtags": extract_hashtags(input_text),
         "Frames": frames_table,
         "FramesMapping": frame_mapping,
-        "Keywords": extract_keywords(input_text)
     }
 if uploaded_docx:
@@ -363,14 +244,16 @@ if uploaded_docx:
     for caption, text in captions.items():
         frame_mapping = get_frame_category_mapping(text)
         frames_table = format_frame_categories_table(frame_mapping)
         output_data[caption] = {
             "Full Caption": text,
             "Language": detect_language(text),
-            "Tone": extract_tone(text),
             "Hashtags": extract_hashtags(text),
             "Frames": frames_table,
             "FramesMapping": frame_mapping,
-            "Keywords": extract_keywords(text)
         }
 if uploaded_excel:
@@ -391,4 +274,4 @@ if output_data:
     docx_io = io.BytesIO()
     docx_output.save(docx_io)
     docx_io.seek(0)
-    st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="coding_sheet.docx")

 import os
 import pandas as pd
 import streamlit as st
 from langdetect import detect
 from collections import Counter
 from dotenv import load_dotenv
+from transformers import AutoModelForCausalLM, AutoTokenizer
 # Load environment variables
 load_dotenv()
 # Initialize logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+# --- Initialize DeepSeek-V3-0324 locally ---
+MODEL_NAME = "deepseek-ai/DeepSeek-V3-0324"
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
def generate_response(prompt: str, max_length: int = 150, temperature: float = 0.5) -> str:
    """Generate a sampled completion for *prompt* and return only the new text.

    Args:
        prompt: Text that is tokenized and fed to the module-level ``model``.
        max_length: Upper bound on the number of tokens generated *after* the
            prompt. It is forwarded as ``max_new_tokens`` so that a long prompt
            cannot use up the whole generation budget.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The decoded completion with special tokens removed and surrounding
        whitespace stripped. The prompt itself is not included.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    prompt_length = input_ids.shape[-1]
    outputs = model.generate(
        input_ids,
        max_new_tokens=max_length,
        do_sample=True,
        temperature=temperature,
        top_p=0.95,
    )
    # A causal LM returns prompt + continuation in one sequence; drop the
    # prompt tokens so callers that split the reply on commas do not also
    # parse their own instructions.
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()
def extract_keywords(text: str) -> list:
    """Ask the language model for the most important keywords in *text*.

    The prompt requests a comma-separated list; the reply is split on commas
    and each item is stripped of surrounding whitespace.

    Args:
        text: The text to analyse.

    Returns:
        A list of non-empty keyword strings. Empty or whitespace-only input
        returns ``[]`` without calling the model.
    """
    # Nothing to analyse: skip the (expensive) generation call entirely.
    if not text or not text.strip():
        return []

    prompt = (
        "Extract the most important keywords from the following text. "
        "Return them as a comma-separated list.\n\n"
        f'Text: "{text}"'
    )
    response = generate_response(prompt, max_length=100, temperature=0.5)
    return [kw for kw in map(str.strip, response.split(",")) if kw]
def suggest_themes(keywords: list) -> list:
    """Ask the language model for themes that fit the given keywords.

    The keywords are joined with ``", "`` into a prompt that requests a
    comma-separated list; the reply is split on commas and each item is
    stripped of surrounding whitespace.

    Args:
        keywords: Keyword strings, typically the output of keyword extraction.

    Returns:
        A list of non-empty theme strings. An empty (or ``None``) keyword
        list returns ``[]`` without calling the model.
    """
    # No keywords means there is nothing to derive themes from.
    if not keywords:
        return []

    keywords_str = ", ".join(keywords)
    prompt = (
        f"Based on the following keywords: {keywords_str}, "
        "suggest a list of relevant themes. Return them as a comma-separated list."
    )
    response = generate_response(prompt, max_length=100, temperature=0.5)
    return [theme for theme in map(str.strip, response.split(",")) if theme]
+# --- Retain or slightly adjust other helper functions ---
 def detect_language(text):
     try:
         return detect(text)
         logging.error(f"Error detecting language: {e}")
         return "unknown"
 def extract_hashtags(text):
     """Return every ``#`` followed by one or more word characters, in order of appearance."""
     hashtag_pattern = re.compile(r"#\w+")
     return hashtag_pattern.findall(text)
 def extract_captions_from_docx(docx_file):
     doc = Document(docx_file)
     captions = {}
             generated_data[post_number] = post_data
     return generated_data
def format_frame_categories_table(category_mapping):
    """Render a frame -> category mapping as a markdown table.

    The table has one row per frame and one column per category
    (Major Focus, Significant Focus, Minor Mention, Not Applicable). A tick
    marks the column equal to the frame's category; a category that matches
    none of the columns produces a row with no tick.

    Args:
        category_mapping: Dict of frame name -> category label. Rows are
            emitted in the dict's iteration order.

    Returns:
        The markdown table as a string, with a header row, a separator row,
        and a trailing newline after every row.
    """
    columns = ("Major Focus", "Significant Focus", "Minor Mention", "Not Applicable")
    tick = "✓"

    lines = [
        "| Frame | " + " | ".join(columns) + " |",
        "| --- |" + " --- |" * len(columns),
    ]
    for frame, category in category_mapping.items():
        cells = [tick if category == column else "" for column in columns]
        lines.append(f"| {frame} | " + " | ".join(cells) + " |")

    # Every line, including the last, is newline-terminated.
    return "\n".join(lines) + "\n"
def get_frame_category_mapping(text):
    """Assign every known frame a focus level based on keyword hits in *text*.

    A frame's score is the number of its keywords that occur as a substring
    of the lower-cased text (each keyword counts at most once, however often
    it appears). Frames with a score above zero are ranked by score, with
    ties keeping the order in which the frames are defined:

    * rank 1        -> "Major Focus"
    * ranks 2 and 3 -> "Significant Focus"
    * ranks 4+      -> "Minor Mention"

    Every remaining frame is labelled "Not Applicable".

    Args:
        text: The caption or free text to classify.

    Returns:
        Dict of frame name -> focus label covering all frames. Detected
        frames come first in rank order, followed by the undetected frames
        in definition order.
    """
    frame_categories = {
        "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
        "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
        "Gender & Patriarchy": ["gender", "women", "violence", "patriarchy", "equality"],
        "Religious Freedom & Persecution": ["religion", "persecution", "minorities", "intolerance", "faith"],
        "Grassroots Mobilization": ["activism", "community", "movement", "local", "mobilization"],
        "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
        "Anti-Extremism & Anti-Violence": ["extremism", "violence", "hate speech", "radicalism", "mob attack"],
        "Social Inequality & Economic Disparities": ["class privilege", "labor rights", "economic", "discrimination"],
        "Activism & Advocacy": ["justice", "rights", "demand", "protest", "march", "campaign", "freedom of speech"],
        "Systemic Oppression": ["discrimination", "oppression", "minorities", "marginalized", "exclusion"],
        "Intersectionality": ["intersecting", "women", "minorities", "struggles", "multiple oppression"],
        "Call to Action": ["join us", "sign petition", "take action", "mobilize", "support movement"],
        "Empowerment & Resistance": ["empower", "resist", "challenge", "fight for", "stand up"],
        "Climate Justice": ["environment", "climate change", "sustainability", "biodiversity", "pollution"],
        "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"],
    }

    text_lower = text.lower()
    hits = {
        frame: sum(keyword in text_lower for keyword in keywords)
        for frame, keywords in frame_categories.items()
    }

    # sorted() is stable even with reverse=True, so equally scored frames
    # keep their definition order.
    ranked = sorted(
        (frame for frame, count in hits.items() if count > 0),
        key=hits.get,
        reverse=True,
    )

    category_mapping = {}
    for rank, frame in enumerate(ranked):
        if rank == 0:
            category_mapping[frame] = "Major Focus"
        elif rank < 3:
            category_mapping[frame] = "Significant Focus"
        else:
            category_mapping[frame] = "Minor Mention"

    for frame in frame_categories:
        category_mapping.setdefault(frame, "Not Applicable")
    return category_mapping
 def create_docx_from_data(extracted_data):
     doc = Document()
     for post_number, data in extracted_data.items():
         ordered_keys = [
             "Post Number", "Date of Post", "Media Type", "Number of Pictures",
             "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
+            "Full Caption", "Language", "Tone", "Hashtags", "Keywords"
         ]
         for key in ordered_keys:
             value = data.get(key, "N/A")
             if key in ["Tone", "Hashtags", "Keywords"]:
                 value = ", ".join(value) if isinstance(value, list) else value
             para = doc.add_paragraph()
             run = para.add_run(f"**{key}:** {value}")
             run.font.size = Pt(11)
         if "FramesMapping" in data:
             doc.add_paragraph("Frames:")
             mapping = data["FramesMapping"]
         else:
             value = data.get("Frames", "N/A")
             doc.add_paragraph(f"**Frames:** {value}")
+        # --- New: Summary Table for Keywords, Themes, and Frames ---
         keywords = data.get("Keywords", [])
+        # Generate themes using DeepSeek-based function
         themes = suggest_themes(keywords) if keywords else []
         doc.add_paragraph("Summary Table:")
         summary_table = doc.add_table(rows=1, cols=3)
         summary_table.style = "Light List Accent 1"
         hdr_cells[0].text = "Keywords"
         hdr_cells[1].text = "Themes"
         hdr_cells[2].text = "Frames"
         row_cells = summary_table.add_row().cells
         row_cells[0].text = ", ".join(keywords) if keywords else "N/A"
         row_cells[1].text = ", ".join(themes) if themes else "N/A"
+        frames_from_mapping = data.get("FramesMapping", {})
+        frames_list = ", ".join([f"{frame} ({cat})" for frame, cat in frames_from_mapping.items()])
+        row_cells[2].text = frames_list if frames_list else "N/A"
         doc.add_paragraph("\n")
     return doc
+# --- Streamlit App UI ---
 st.title("AI-Powered Coding Sheet Generator")
 st.write("Enter text or upload a DOCX/Excel file for analysis:")
 if input_text:
     frame_mapping = get_frame_category_mapping(input_text)
     frames_table = format_frame_categories_table(frame_mapping)
+    # Use the DeepSeek-based keyword extraction
+    keywords = extract_keywords(input_text)
+    # For demonstration, reusing the extract_keywords for Tone as well (consider creating a dedicated tone function)
+    tone = extract_keywords(input_text)
     output_data["Manual Input"] = {
         "Full Caption": input_text,
         "Language": detect_language(input_text),
+        "Tone": tone,
         "Hashtags": extract_hashtags(input_text),
         "Frames": frames_table,
         "FramesMapping": frame_mapping,
+        "Keywords": keywords
     }
 if uploaded_docx:
     for caption, text in captions.items():
         frame_mapping = get_frame_category_mapping(text)
         frames_table = format_frame_categories_table(frame_mapping)
+        keywords = extract_keywords(text)
+        tone = extract_keywords(text)
         output_data[caption] = {
             "Full Caption": text,
             "Language": detect_language(text),
+            "Tone": tone,
             "Hashtags": extract_hashtags(text),
             "Frames": frames_table,
             "FramesMapping": frame_mapping,
+            "Keywords": keywords
         }
 if uploaded_excel:
     docx_io = io.BytesIO()
     docx_output.save(docx_io)
     docx_io.seek(0)
+    st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="coding_sheet.docx")