ahm14 committed on
Commit
ba880a7
·
verified ·
1 Parent(s): eccdab3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +332 -62
app.py CHANGED
@@ -5,85 +5,355 @@ import re
5
  import logging
6
  import nltk
7
  from docx import Document
8
- from collections import Counter
 
9
  import io
 
 
10
  from dotenv import load_dotenv
 
 
 
 
11
 
12
  # Load environment variables
13
  load_dotenv()
14
 
 
 
 
 
 
 
15
  # Initialize logging
16
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
17
 
 
 
 
18
  # Download required NLTK resources
19
  nltk.download("punkt")
20
 
 
21
  st.title("AI-Powered Coding Sheet Generator")
22
- st.write("Enter text or upload a DOCX/Excel file for analysis:")
23
-
24
- # Option to enable separate tab feature
25
- separate_tab = st.checkbox("Enable Separate Tab for Summary")
26
 
27
- input_text = st.text_area("Input Text", height=200)
28
- uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
29
- uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
 
 
30
 
31
- output_data = {}
32
-
33
- # Function to extract text from DOCX
34
- def extract_text_from_docx(docx_file):
35
- doc = Document(docx_file)
36
- return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
37
-
38
- # Function to analyze summary data
39
- def generate_summary(data):
40
- total_posts = len(data)
41
- tones = Counter()
42
- languages = Counter()
43
- frames = Counter()
44
- frame_focus = {"Major Focus": Counter(), "Significant Focus": Counter(), "Minor Mention": Counter(), "Not Applicable": Counter()}
45
-
46
- for post in data.values():
47
- tones.update(post.get("Tone", []))
48
- languages[post.get("Language", "Unknown")] += 1
49
- frame_mapping = post.get("FramesMapping", {})
50
- for frame, focus in frame_mapping.items():
51
- frames[frame] += 1
52
- frame_focus[focus][frame] += 1
53
-
54
- abstract = f"This document contains {total_posts} posts. The most commonly used tone is '{tones.most_common(1)}'. "
55
- abstract += f"The most frequently mentioned frame is '{frames.most_common(1)}'. Languages used include {list(languages.keys())}."
56
-
57
- return total_posts, tones, languages, frames, frame_focus, abstract
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- # Function to create an Excel summary
60
- def create_summary_excel(summary_data):
61
- total_posts, tones, languages, frames, frame_focus, abstract = summary_data
62
- with io.BytesIO() as buffer:
63
- writer = pd.ExcelWriter(buffer, engine='xlsxwriter')
 
 
64
 
65
- pd.DataFrame(tones.items(), columns=["Tone", "Count"]).to_excel(writer, sheet_name="Tones", index=False)
66
- pd.DataFrame(languages.items(), columns=["Language", "Count"]).to_excel(writer, sheet_name="Languages", index=False)
67
- pd.DataFrame(frames.items(), columns=["Frame", "Count"]).to_excel(writer, sheet_name="Frames", index=False)
68
 
69
- for focus, counts in frame_focus.items():
70
- pd.DataFrame(counts.items(), columns=["Frame", "Count"]).to_excel(writer, sheet_name=focus, index=False)
 
 
 
71
 
72
- pd.DataFrame({"Abstract": [abstract]}).to_excel(writer, sheet_name="Abstract", index=False)
 
 
 
 
 
 
 
 
 
73
 
74
- writer.close()
75
- buffer.seek(0)
76
- return buffer.getvalue()
77
-
78
- if uploaded_docx:
79
- docx_text = extract_text_from_docx(uploaded_docx)
80
- summary_data = generate_summary({"Uploaded DOCX": {"Full Caption": docx_text}})
81
- if separate_tab:
82
- with st.expander("Summary Tab"):
83
- st.write(f"Total Posts: {summary_data[0]}")
84
- st.write(f"Tones: {dict(summary_data[1])}")
85
- st.write(f"Languages: {dict(summary_data[2])}")
86
- st.write(f"Frames: {dict(summary_data[3])}")
87
- st.write(f"Abstract: {summary_data[5]}")
88
- excel_data = create_summary_excel(summary_data)
89
- st.download_button("Download Summary as Excel", data=excel_data, file_name="summary.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
 
 
 
 
 
 
 
 
 
 
 
5
  import logging
6
  import nltk
7
  from docx import Document
8
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
9
+ from docx.shared import Pt
10
  import io
11
+ from langdetect import detect
12
+ from collections import Counter
13
  from dotenv import load_dotenv
14
+ from langchain_groq import ChatGroq
15
+ from langchain_core.output_parsers import StrOutputParser
16
+ from langchain_core.prompts import ChatPromptTemplate
17
+ from transformers import pipeline
18
 
19
  # Load environment variables
20
  load_dotenv()
21
 
22
# Initialize logging first: the root logger auto-configures itself on the
# first logging.error() call, which would turn a later basicConfig() into a
# silent no-op (wrong level/format for the rest of the app).
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Check if Groq API key is available
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    logging.error("Missing Groq API key. Please set the GROQ_API_KEY environment variable.")
    st.error("API key is missing. Please provide a valid API key.")

# Initialize LLM (Groq API)
llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")

# Download required NLTK resources
nltk.download("punkt")

# Streamlit App UI
st.title("AI-Powered Coding Sheet Generator")
tabs = st.tabs(["Text Analysis", "DOCX Processing"])
 
 
 
40
 
41
with tabs[0]:
    st.write("Enter text or upload a DOCX/Excel file for analysis:")
    # Free-form text input for ad-hoc analysis.
    input_text = st.text_area("Input Text", height=200)
    # key="docx1" keeps this uploader's widget id distinct from the
    # second DOCX uploader in the "DOCX Processing" tab (key="docx2").
    uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"], key="docx1")
    # Optional Excel metadata to merge into the generated analysis.
    uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
46
 
47
# Existing processing logic...
# Tone categories for fallback method.
# Keyword lexicon: extract_tone_fallback assigns a tone label when any of its
# keywords occurs as a substring of the lowercased input text.
tone_categories = {
    "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
    "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
    "Somber": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief", "devastation"],
    "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
    "Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
    "Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
    "Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
    "Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
    "Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
    "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
}

# Frame categories for fallback method.
# Same keyword-matching scheme; dict insertion order matters downstream:
# get_frame_category_mapping breaks frequency ties by this order (stable sort).
frame_categories = {
    "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
    "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
    "Gender & Patriarchy": ["gender", "women", "violence", "patriarchy", "equality"],
    "Religious Freedom & Persecution": ["religion", "persecution", "minorities", "intolerance", "faith"],
    "Grassroots Mobilization": ["activism", "community", "movement", "local", "mobilization"],
    "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
    "Anti-Extremism & Anti-Violence": ["extremism", "violence", "hate speech", "radicalism", "mob attack"],
    "Social Inequality & Economic Disparities": ["class privilege", "labor rights", "economic", "discrimination"],
    "Activism & Advocacy": ["justice", "rights", "demand", "protest", "march", "campaign", "freedom of speech"],
    "Systemic Oppression": ["discrimination", "oppression", "minorities", "marginalized", "exclusion"],
    "Intersectionality": ["intersecting", "women", "minorities", "struggles", "multiple oppression"],
    "Call to Action": ["join us", "sign petition", "take action", "mobilize", "support movement"],
    "Empowerment & Resistance": ["empower", "resist", "challenge", "fight for", "stand up"],
    "Climate Justice": ["environment", "climate change", "sustainability", "biodiversity", "pollution"],
    "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
}
80
+
81
# Detect language
def detect_language(text):
    """Return the ISO language code langdetect reports for *text*, or "unknown" on failure."""
    try:
        code = detect(text)
    except Exception as exc:
        # langdetect raises on empty/ambiguous input; degrade gracefully.
        logging.error(f"Error detecting language: {exc}")
        return "unknown"
    return code
88
+
89
# Extract tone using Groq API (or fallback method)
def extract_tone(text):
    """
    Return a list of tone labels for *text*.

    Asks the Groq chat model first; on any API failure falls back to the
    keyword-based extract_tone_fallback.
    """
    try:
        # langchain's ChatGroq exposes .invoke(), not .chat(), and returns an
        # AIMessage whose text lives in .content — the previous
        # llm.chat(...)["choices"][0]... call raised on every invocation and
        # silently forced the fallback path.
        response = llm.invoke([
            ("system", "Analyze the tone of the following text and provide descriptive tone labels."),
            ("user", text),
        ])
        return response.content.split(", ")
    except Exception as e:
        logging.error(f"Groq API error: {e}")
        return extract_tone_fallback(text)
98
+
99
# Fallback method for tone extraction
def extract_tone_fallback(text):
    """Keyword-based tone detection; returns ["Neutral"] when no lexicon matches."""
    lowered = text.lower()
    matched = {
        category
        for category, keywords in tone_categories.items()
        if any(keyword in lowered for keyword in keywords)
    }
    return list(matched) if matched else ["Neutral"]
107
+
108
# Extract hashtags
def extract_hashtags(text):
    """Return every #hashtag token (#' followed by word characters) found in *text*."""
    hashtag_pattern = re.compile(r"#\w+")
    return hashtag_pattern.findall(text)
111
+
112
# -------------------------------------------------------------------
# New functions for frame categorization and display
# -------------------------------------------------------------------

def get_frame_category_mapping(text):
    """
    Map every frame in frame_categories to a focus level for *text*.

    Detected frames (at least one keyword hit) are ranked by hit count:
    the top frame gets "Major Focus", the next up to two get
    "Significant Focus", the remainder get "Minor Mention". Frames with
    no hits are marked "Not Applicable".
    """
    lowered = text.lower()

    # Keyword-hit count per frame (substring matching, like the original).
    hits = {
        frame: sum(keyword in lowered for keyword in keywords)
        for frame, keywords in frame_categories.items()
    }

    # Detected frames, most frequent first; sorted() is stable, so ties
    # keep frame_categories declaration order.
    ranked = sorted(
        (frame for frame, count in hits.items() if count > 0),
        key=lambda frame: hits[frame],
        reverse=True,
    )

    mapping = {}
    for position, frame in enumerate(ranked):
        if position == 0:
            mapping[frame] = "Major Focus"
        elif position <= 2:
            mapping[frame] = "Significant Focus"
        else:
            mapping[frame] = "Minor Mention"

    # Everything that never matched is explicitly Not Applicable.
    for frame in frame_categories:
        mapping.setdefault(frame, "Not Applicable")
    return mapping
151
+
152
def format_frame_categories_table(category_mapping):
    """
    Render *category_mapping* as a markdown table: one row per frame, with a
    tick (✓) in the single column matching its assigned focus category.
    """
    tick = "✓"
    columns = ["Major Focus", "Significant Focus", "Minor Mention", "Not Applicable"]
    lines = [
        "| Frame | Major Focus | Significant Focus | Minor Mention | Not Applicable |",
        "| --- | --- | --- | --- | --- |",
    ]
    for frame, assigned in category_mapping.items():
        cells = [tick if assigned == column else "" for column in columns]
        lines.append("| " + frame + " | " + " | ".join(cells) + " |")
    # Trailing newline matches the original line-by-line concatenation.
    return "\n".join(lines) + "\n"
169
+
170
# -------------------------------------------------------------------
# Existing functions for file processing
# -------------------------------------------------------------------

def extract_captions_from_docx(docx_file):
    """
    Group paragraph text under "Post N" heading paragraphs.

    Returns {post heading: space-joined following paragraphs}; posts with no
    body paragraphs at all are dropped.
    """
    document = Document(docx_file)
    grouped = {}
    active_post = None
    for paragraph in document.paragraphs:
        line = paragraph.text.strip()
        if re.match(r"Post \d+", line, re.IGNORECASE):
            # New post heading: start collecting its body lines.
            active_post = line
            grouped[active_post] = []
        elif active_post:
            grouped[active_post].append(line)
    return {post: " ".join(parts) for post, parts in grouped.items() if parts}
186
+
187
def extract_metadata_from_excel(excel_file):
    """Read *excel_file* into a list of per-row dicts; returns [] on any read error."""
    try:
        frame = pd.read_excel(excel_file)
        records = frame.to_dict(orient="records")
    except Exception as exc:
        logging.error(f"Error processing Excel file: {exc}")
        return []
    return records
195
+
196
def merge_metadata_with_generated_data(generated_data, excel_metadata):
    """
    Merge Excel row dicts into *generated_data*, keyed by "Post N".

    Rows with a "Post Number" field update (or create) that post; rows
    without one default to one past the current post count. Mutates and
    returns *generated_data*.
    </style note: same contract as before, fresh phrasing>
    """
    for row in excel_metadata:
        key = "Post {}".format(row.get("Post Number", len(generated_data) + 1))
        if key in generated_data:
            generated_data[key].update(row)
        else:
            generated_data[key] = row
    return generated_data
204
+
205
def create_docx_from_data(extracted_data):
    """
    Build a python-docx Document from *extracted_data* ({post: field dict}).

    Each post becomes a level-1 heading followed by "Key: value" paragraphs
    (bold labels); when a "FramesMapping" is present the frame focus levels
    are rendered as a 5-column tick table, otherwise the raw "Frames" value
    is written as a paragraph.
    """
    doc = Document()
    for post_number, data in extracted_data.items():
        doc.add_heading(post_number, level=1)
        ordered_keys = [
            "Post Number", "Date of Post", "Media Type", "Number of Pictures",
            "Number of Videos", "Number of Audios", "Likes", "Comments", "Tagged Audience",
            "Full Caption", "Language", "Tone", "Hashtags"
        ]
        for key in ordered_keys:
            value = data.get(key, "N/A")
            if key in ["Tone", "Hashtags"]:
                value = ", ".join(value) if isinstance(value, list) else value
            para = doc.add_paragraph()
            # DOCX does not render markdown: writing literal "**key:**" showed
            # the asterisks in the document. Use a bold run for the label.
            label = para.add_run(f"{key}: ")
            label.bold = True
            label.font.size = Pt(11)
            body = para.add_run(str(value))
            body.font.size = Pt(11)
        # Add a proper table for Frames if a mapping is available.
        if "FramesMapping" in data:
            doc.add_paragraph("Frames:")
            mapping = data["FramesMapping"]
            table = doc.add_table(rows=1, cols=5)
            table.style = "Light List Accent 1"
            hdr_cells = table.rows[0].cells
            hdr_cells[0].text = "Frame"
            hdr_cells[1].text = "Major Focus"
            hdr_cells[2].text = "Significant Focus"
            hdr_cells[3].text = "Minor Mention"
            hdr_cells[4].text = "Not Applicable"
            tick = "✓"
            for frame, category in mapping.items():
                row_cells = table.add_row().cells
                row_cells[0].text = frame
                row_cells[1].text = tick if category == "Major Focus" else ""
                row_cells[2].text = tick if category == "Significant Focus" else ""
                row_cells[3].text = tick if category == "Minor Mention" else ""
                row_cells[4].text = tick if category == "Not Applicable" else ""
        else:
            # No mapping: fall back to the raw "Frames" value, bold label again
            # instead of literal markdown asterisks.
            value = data.get("Frames", "N/A")
            para = doc.add_paragraph()
            label = para.add_run("Frames: ")
            label.bold = True
            para.add_run(str(value))
        doc.add_paragraph("\n")
    return doc
246
+
247
# -------------------------------------------------------------------
# Streamlit App UI
# -------------------------------------------------------------------
# NOTE: st.title() is already rendered once at the top of the app; repeating
# it here produced a duplicated page title, so it was removed.

st.write("Enter text or upload a DOCX/Excel file for analysis:")

# Unique keys are required: widgets with identical labels/params are also
# created in the "Text Analysis" tab, and Streamlit raises a duplicate
# element id error when two widgets hash to the same auto-generated id.
input_text = st.text_area("Input Text", height=200, key="main_input_text")
uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"], key="main_docx")
uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"], key="main_excel")

output_data = {}

if input_text:
    frame_mapping = get_frame_category_mapping(input_text)
    frames_table = format_frame_categories_table(frame_mapping)
    output_data["Manual Input"] = {
        "Full Caption": input_text,
        "Language": detect_language(input_text),
        "Tone": extract_tone(input_text),
        "Hashtags": extract_hashtags(input_text),
        "Frames": frames_table,
        "FramesMapping": frame_mapping,
    }

if uploaded_docx:
    captions = extract_captions_from_docx(uploaded_docx)
    for caption, text in captions.items():
        frame_mapping = get_frame_category_mapping(text)
        frames_table = format_frame_categories_table(frame_mapping)
        output_data[caption] = {
            "Full Caption": text,
            "Language": detect_language(text),
            "Tone": extract_tone(text),
            "Hashtags": extract_hashtags(text),
            "Frames": frames_table,
            "FramesMapping": frame_mapping,
        }

if uploaded_excel:
    excel_metadata = extract_metadata_from_excel(uploaded_excel)
    output_data = merge_metadata_with_generated_data(output_data, excel_metadata)

if output_data:
    # Per-post expandable preview of every generated/merged field.
    for post_number, data in output_data.items():
        with st.expander(post_number):
            for key, value in data.items():
                if key == "Frames":
                    st.markdown(f"**{key}:**\n{value}")
                else:
                    st.write(f"**{key}:** {value}")

    # DOCX export of the merged analysis (merged with the preview's
    # `if output_data:` — the two identical conditionals were redundant).
    docx_output = create_docx_from_data(output_data)
    docx_io = io.BytesIO()
    docx_output.save(docx_io)
    docx_io.seek(0)
    st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="coding_sheet.docx")
305
 
306
with tabs[1]:
    st.write("Upload a DOCX file for document-wide processing:")
    uploaded_docx2 = st.file_uploader("Upload a DOCX file", type=["docx"], key="docx2")

    if uploaded_docx2:
        doc = Document(uploaded_docx2)
        texts = [para.text.strip() for para in doc.paragraphs if para.text.strip()]

        # Count total posts ("Post N" heading paragraphs).
        total_posts = sum(1 for t in texts if re.match(r"Post \d+", t))

        # Aggregate tone, language, and frame statistics across paragraphs.
        tones = []
        languages = []
        frames_count = Counter()       # paragraphs in which each frame was actually detected
        frame_focus_count = Counter()  # document-wide distribution of focus levels
        per_frame_focus = {}           # frame -> Counter of its own focus levels

        for text in texts:
            tones.extend(extract_tone(text))
            languages.append(detect_language(text))

            frame_mapping = get_frame_category_mapping(text)
            for frame, category in frame_mapping.items():
                # The mapping always contains EVERY frame (undetected ones as
                # "Not Applicable"), so unconditionally incrementing made all
                # frame counts equal to len(texts). Count detections only.
                if category != "Not Applicable":
                    frames_count[frame] += 1
                frame_focus_count[category] += 1
                per_frame_focus.setdefault(frame, Counter())[category] += 1

        # Generate Summary
        summary = f"Total Posts: {total_posts}\n"
        summary += f"Detected Tones: {Counter(tones)}\n"
        summary += f"Languages Used: {Counter(languages)}\n"
        summary += f"Frame Distribution: {frames_count}\n"
        summary += f"Frame Focus Levels: {frame_focus_count}\n"

        st.write("## Document Summary")
        st.text(summary)

        # Excel export: one row per frame with that frame's own focus-level
        # breakdown (previously every row repeated the document-wide totals,
        # which made the per-frame columns meaningless).
        frames = list(per_frame_focus.keys())
        df = pd.DataFrame({
            "Frame": frames,
            "Count": [frames_count.get(f, 0) for f in frames],
            "Major Focus": [per_frame_focus[f].get("Major Focus", 0) for f in frames],
            "Significant Focus": [per_frame_focus[f].get("Significant Focus", 0) for f in frames],
            "Minor Mention": [per_frame_focus[f].get("Minor Mention", 0) for f in frames],
            "Not Applicable": [per_frame_focus[f].get("Not Applicable", 0) for f in frames],
        })

        excel_io = io.BytesIO()
        with pd.ExcelWriter(excel_io, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False, sheet_name='Frame Analysis')
        excel_io.seek(0)

        st.download_button("Download Analysis as Excel", data=excel_io, file_name="document_analysis.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")