Spaces:

abid-ai
/

DocuMind_Intelligence

Running

App Files Files Community

abid-ai committed 22 days ago

Commit

e68fa5e

verified ·

1 Parent(s): 1e8275b

Update app.py

Browse files

Files changed (1) hide show

app.py +129 -214

app.py CHANGED Viewed

@@ -1,225 +1,140 @@
-import gradio as gr
-import json
-import plotly.express as px
-import pandas as pd
-from groq import Groq
-from fpdf import FPDF
-from youtube_comment_downloader import YoutubeCommentDownloader
-import re
 import os
-import warnings
-warnings.filterwarnings("ignore")
-# ====================== CONFIG ======================
-# On Hugging Face, go to Settings -> Variables and Secrets to add GROQ_API_KEY
-GROQ_API_KEY = os.getenv("GROQ_API_KEY")
-if not GROQ_API_KEY:
-    print("❌ API Key not found! Please set GROQ_API_KEY in Space Secrets.")
-else:
-    print("✅ API Key loaded successfully!")
-# ====================== SYSTEM PROMPT ======================
-SYSTEM_PROMPT = """
-You are an expert social media sentiment and poll analysis AI.
-Focus on Yes/No, Agree/Disagree, Support/Oppose, and sentiment.
-Handle English + Urdu + Hindi + other languages well.
-Return ONLY valid JSON in this exact format:
-{
-  "main_poll": {
-    "question": "Suggested poll question",
-    "yes_count": int,
-    "no_count": int,
-    "agree_count": int,
-    "disagree_count": int,
-    "support_count": int,
-    "oppose_count": int,
-    "neutral_count": int
-  },
-  "sentiment": {
-    "positive": float,
-    "negative": float,
-    "neutral": float
-  },
-  "top_themes": ["theme1", "theme2"],
-  "summary": "Short professional summary",
-  "labeled_comments": [
-    {"comment": "...", "opinion": "Yes|No|Agree|Disagree|Positive|Negative|Neutral|Mixed"}
-  ]
-}
 """
-# ====================== TEXT CLEANING FOR PDF/UNICODE ======================
-def clean_text(text):
-    if not text:
-        return ""
-    # Replace problematic characters
-    text = re.sub(r'[\u2022\u2023\u25CF\u25BA\u25C4]', '-', text)  # bullets
-    text = re.sub(r'[\u2018\u2019\u201C\u201D]', '"', text)       # quotes
-    text = re.sub(r'[\u2013\u2014]', '-', text)                   # dashes
-    # Remove any remaining control characters or non-Latin1 for FPDF safety
-    text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)
-    # Remove emojis/non-ASCII for the PDF generator (FPDF limitation)
-    return text.encode('ascii', 'ignore').decode('ascii').strip()
-# ====================== HELPER FUNCTIONS ======================
-def extract_youtube_id(url):
-    patterns = [
-        r'youtu\.be/([a-zA-Z0-9_-]+)',
-        r'v=([a-zA-Z0-9_-]+)',
-        r'/embed/([a-zA-Z0-9_-]+)',
-        r'/shorts/([a-zA-Z0-9_-]+)'
-    ]
-    for pattern in patterns:
-        match = re.search(pattern, url)
-        if match:
-            return match.group(1)
-    return None
-def fetch_youtube_comments(url, limit=100):
     try:
-        video_id = extract_youtube_id(url)
-        if not video_id:
-            return []
-        downloader = YoutubeCommentDownloader()
-        comments = []
-        # get_comments is more stable on servers
-        generator = downloader.get_comments(video_id, sort_by=0)
-        for comment in generator:
-            comments.append(comment['text'])
-            if len(comments) >= limit:
-                break
-        return comments
     except Exception as e:
-        print(f"Fetch error: {e}")
-        return []
-def analyze_comments_with_groq(comments, post_context=""):
-    try:
-        client = Groq(api_key=GROQ_API_KEY)
-        # Clean comments and truncate to fit context window
-        cleaned_comments = [clean_text(c) for c in comments]
-        comments_text = "\n\n".join([f"C{i+1}: {c[:200]}" for i, c in enumerate(cleaned_comments)])
-        user_prompt = f"Post Context: {post_context}\n\nAnalyze these comments:\n{comments_text}"
-        response = client.chat.completions.create(
-            model="llama-3.3-70b-versatile",
-            messages=[
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": user_prompt}
-            ],
-            temperature=0.3,
-            max_tokens=3000,
-            response_format={"type": "json_object"}
-        )
-        return json.loads(response.choices[0].message.content)
-    except Exception as e:
-        print("Groq Error:", str(e))
-        return None
-def create_pdf_report(analysis_result, poll_question):
-    try:
-        pdf = FPDF()
-        pdf.add_page()
-        pdf.set_font('Arial', 'B', 16)
-        pdf.cell(0, 10, 'CommentSurvey AI Report', 0, 1, 'C')
-        pdf.ln(10)
-        pdf.set_font('Arial', 'B', 12)
-        pdf.cell(0, 10, f'Poll Question: {clean_text(poll_question)}', 0, 1, 'L')
-        pdf.ln(5)
-        pdf.set_font('Arial', 'B', 12)
-        pdf.cell(0, 10, 'Summary:', 0, 1, 'L')
-        pdf.set_font('Arial', '', 11)
-        pdf.multi_cell(0, 5, clean_text(analysis_result.get('summary', 'No summary.')))
-        pdf.ln(10)
-        pdf.output("CommentSurvey_Report.pdf")
-        return "CommentSurvey_Report.pdf"
-    except Exception as e:
-        print(f"PDF Error: {e}")
-        return None
-# ====================== MAIN ANALYSIS ======================
-def analyze(url):
-    try:
-        if not GROQ_API_KEY:
-            return None, "❌ API Key Missing in Settings!", None, None, None, None
-        if not url or not url.strip():
-            return None, "❌ Please paste a YouTube URL", None, None, None, None
-        comments = fetch_youtube_comments(url)
-        if not comments:
-            return None, "❌ Could not fetch comments (Video might be private or restricted).", None, None, None, None
-        result = analyze_comments_with_groq(comments)
-        if not result:
-            return None, "❌ AI Analysis failed.", None, None, None, None
-        main = result.get('main_poll', {})
-        poll_values = [
-            main.get('yes_count',0) + main.get('agree_count',0) + main.get('support_count',0),
-            main.get('no_count',0) + main.get('disagree_count',0) + main.get('oppose_count',0),
-            main.get('neutral_count',0)
-        ]
-        fig_poll = px.pie(
-            names=['Yes/Agree/Support', 'No/Disagree/Oppose', 'Neutral'],
-            values=poll_values,
-            title="Main Poll Results",
-            hole=0.4
-        )
-        sent = result.get('sentiment', {})
-        fig_sent = px.bar(
-            x=['Positive', 'Negative', 'Neutral'],
-            y=[sent.get('positive',0), sent.get('negative',0), sent.get('neutral',0)],
-            title="Sentiment Breakdown",
-            color=['Positive', 'Negative', 'Neutral']
-        )
-        summary_text = f"**Question:** {main.get('question','N/A')}\n\n**Summary:** {result.get('summary','')}"
-        pdf_path = create_pdf_report(result, main.get('question', 'Survey'))
-        raw_df = pd.DataFrame(result.get('labeled_comments', []))
-        return raw_df, f"✅ Analyzed {len(comments)} comments", fig_poll, fig_sent, summary_text, pdf_path
-    except Exception as e:
-        return None, f"❌ Error: {str(e)}", None, None, None, None
-# ====================== GRADIO UI ======================
-with gr.Blocks(title="CommentSurvey AI", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 📊 CommentSurvey AI\n**Turn YouTube Comments into Smart Insights**")
-    with gr.Row():
-        url_input = gr.Textbox(label="🌐 YouTube Link", placeholder="Paste here...")
-        analyze_btn = gr.Button("🚀 Analyze", variant="primary")
-    status = gr.Markdown("**Status:** Ready")
-    with gr.Tabs():
-        with gr.Tab("📊 Results"):
-            poll_plot = gr.Plot()
-            poll_md = gr.Markdown()
-        with gr.Tab("😊 Sentiment"):
-            sentiment_plot = gr.Plot()
-        with gr.Tab("📜 Data"):
-            raw_table = gr.Dataframe()
-    download_btn = gr.File(label="📥 Download Report")
-    analyze_btn.click(
-        fn=analyze,
-        inputs=[url_input],
-        outputs=[raw_table, status, poll_plot, sentiment_plot, poll_md, download_btn]
-    )
-if __name__ == "__main__":
-    demo.launch()

 import os
+import gdown
+import time
+import gradio as gr
+from google.colab import userdata
+# Modern Imports
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain_groq import ChatGroq
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.output_parsers import StrOutputParser
+# ==========================================
+# 1. SETUP & KEYS
+# ==========================================
+# Copy the secret stored under 'ragapikey' into the GROQ_API_KEY environment variable.
+# NOTE(review): userdata is imported from google.colab, so this presumably only works
+# inside a Colab runtime — confirm it is available where this app is deployed.
+os.environ["GROQ_API_KEY"] = userdata.get('ragapikey')
+# --- UPDATE THIS LIST WITH ALL YOUR LINKS ---
+# Google Drive URLs handed to build_vector_db; a URL containing "/folders/" is
+# downloaded as a folder, any other URL as a single file.
+links_to_process = [
+    "https://drive.google.com/file/d/1rb7AeJZrDNR-bq8Q9V4IvtzYZsDOvDH0/view?usp=sharing",
+    "https://drive.google.com/file/d/16PcJo_JaQHh1bx01lCAkc4QwQ6YnLb-K/view?usp=sharing"
+    #"https://drive.google.com/drive/folders/ANOTHER_FOLDER_ID"
+]
+# Local directory that downloads are saved into and that is later scanned for PDFs;
+# it is created here when it does not exist yet.
+output_dir = 'knowledge_base'
+if not os.path.exists(output_dir):
+    os.makedirs(output_dir)
+# ==========================================
+# 2. IMPROVED DOWNLOAD LOGIC
+# ==========================================
+def build_vector_db(links):
+    # Download every link into output_dir, load all PDFs found there, split them
+    # into chunks, embed the chunks and return a FAISS vector store built from them.
+    #
+    # links: iterable of Google Drive URLs. A URL containing "/folders/" is fetched
+    #        with gdown.download_folder, anything else with gdown.download.
+    # Returns: the FAISS vector store created from the chunked documents.
+    # Raises: ValueError when no document could be loaded from output_dir.
+    print(f"📥 Starting synchronization for {len(links)} sources...")
+    for link in links:
+        try:
+            if "/folders/" in link:
+                print(f"📂 Syncing Folder: {link}")
+                gdown.download_folder(url=link, output=output_dir, quiet=True, use_cookies=False)
+            else:
+                print(f"📄 Syncing Individual File: {link}")
+                # Use output_dir + "/" to ensure it saves into the folder
+                gdown.download(url=link, output=output_dir + "/", quiet=True)
+            time.sleep(1) # Small pause to respect Drive rate limits
+        except Exception as e:
+            # Best-effort: a failed download is reported and skipped so the
+            # remaining links are still processed.
+            print(f"⚠️ Skip Link: Could not download {link}. Error: {e}")
+    all_docs = []
+    # Use os.walk to find PDFs even inside subfolders downloaded by download_folder
+    # NOTE(review): this walks the whole of output_dir, so PDFs already present from
+    # an earlier run are indexed too; the ".pdf" suffix match is case-sensitive.
+    for root, dirs, files in os.walk(output_dir):
+        for filename in files:
+            if filename.endswith(".pdf"):
+                file_path = os.path.join(root, filename)
+                try:
+                    loader = PyPDFLoader(file_path)
+                    all_docs.extend(loader.load())
+                except Exception as e:
+                    # A PDF that fails to load is reported and skipped; the others are kept.
+                    print(f"❌ Error loading {filename}: {e}")
+    # Fail loudly instead of building an empty index.
+    if not all_docs:
+        raise ValueError("No PDF documents found! Ensure links are set to 'Anyone with the link'.")
+    # Chunking & Embeddings
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
+    chunks = text_splitter.split_documents(all_docs)
+    print(f"🧠 Creating embeddings for {len(chunks)} text chunks...")
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    vector_db = FAISS.from_documents(chunks, embeddings)
+    print("✅ Multi-Source Vector Database Created Successfully!")
+    return vector_db
+# Initialize
+# Runs at import time: build_vector_db performs the downloads, PDF loading and
+# embedding before the rest of the module executes.
+vector_store = build_vector_db(links_to_process)
+# search_kwargs k=3 is presumably the number of chunks retrieved per query — confirm.
+retriever = vector_store.as_retriever(search_kwargs={"k": 3})
+# ==========================================
+# 3. MODERN RAG CHAIN
+# ==========================================
+llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)
+# Prompt with two placeholders, {context} and {question}, filled by the chain below.
+template = """Answer the question based ONLY on the following context:
+{context}
+Question: {question}
+Helpful Answer:"""
+prompt = ChatPromptTemplate.from_template(template)
+# Pipeline: the input query goes to the retriever (as "context") and through
+# RunnablePassthrough (as "question"); the mapping is piped into the prompt,
+# then the LLM, then a string output parser.
+# NOTE(review): no formatting step is visible between the retriever and the prompt —
+# confirm how the retrieved documents are rendered into {context}.
+rag_chain = (
+    {"context": retriever, "question": RunnablePassthrough()}
+    | prompt
+    | llm
+    | StrOutputParser()
+)
+# ==========================================
+# 4. PROFESSIONAL FRONTEND (GRADIO BLOCKS)
+# ==========================================
+# CSS passed to gr.Blocks(css=...). The selectors correspond to the elem_id /
+# elem_classes values set on the layout components: "main-container",
+# "header-text" and "report-box".
+custom_css = """
+#main-container { max-width: 900px; margin: auto; padding: 20px; }
+.header-text { text-align: center; color: #1e293b; margin-bottom: 2px; }
+.report-box { background-color: #ffffff; border-radius: 8px; border: 1px solid #e2e8f0; padding: 15px; min-height: 200px; }
 """
+def process_query(query):
+    # Gradio callback: answer `query` by invoking rag_chain and return the text
+    # shown in the report panel.
+    # Blank or whitespace-only input returns a fixed "System Note" message
+    # without calling the chain.
+    # Any exception raised while invoking the chain is caught and returned as an
+    # error message instead of propagating.
+    # NOTE(review): query.strip() runs before the try block, so a non-string query
+    # (e.g. None) would raise here — confirm the caller always passes a str.
+    if not query.strip():
+        return "### ⚠️ System Note\n*Please enter a strategic inquiry to begin analysis.*"
     try:
+        return rag_chain.invoke(query)
     except Exception as e:
+        return f"### ❌ Error\nAn error occurred: {str(e)}"
+# Page layout: one centered column holding the title, the question textbox, an
+# action row (submit + reset) and the Markdown report panel.
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo"), css=custom_css) as demo:
+    with gr.Column(elem_id="main-container"):
+        gr.Markdown("# 🏛️ Enterprise Knowledge Engine", elem_classes="header-text")
+        gr.Markdown("<p style='text-align: center;'>Multi-Source Document Synthesis via Groq & FAISS</p>")
+        gr.HTML("<hr>")
+        user_input = gr.Textbox(label="Strategic Inquiry", placeholder="Ask a question about the collected knowledge base...", lines=3)
+        with gr.Row():
+            submit_btn = gr.Button("ANALYZE DATA", variant="primary", scale=2)
+            # NOTE(review): the clear button is given only user_input, so presumably
+            # the report panel is not reset — confirm that is intended.
+            clear_btn = gr.ClearButton([user_input], value="RESET DASHBOARD", scale=1)
+        gr.Markdown("### 📋 Intelligence Report")
+        with gr.Column(elem_classes="report-box"):
+            output_display = gr.Markdown(value="_Awaiting input..._")
+    # Both clicking the button and submitting the textbox route the query through
+    # process_query and write the result to the report panel.
+    submit_btn.click(fn=process_query, inputs=user_input, outputs=output_display)
+    user_input.submit(fn=process_query, inputs=user_input, outputs=output_display)
+# NOTE(review): launch runs unconditionally at module level with share=True —
+# confirm a public share link is wanted in the deployment environment.
+demo.launch(share=True)