abid-ai committed on
Commit
e68fa5e
·
verified ·
1 Parent(s): 1e8275b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -214
app.py CHANGED
@@ -1,225 +1,140 @@
1
- import gradio as gr
2
- import json
3
- import plotly.express as px
4
- import pandas as pd
5
- from groq import Groq
6
- from fpdf import FPDF
7
- from youtube_comment_downloader import YoutubeCommentDownloader
8
- import re
9
  import os
10
- import warnings
11
-
12
- warnings.filterwarnings("ignore")
13
-
14
- # ====================== CONFIG ======================
15
- # On Hugging Face, go to Settings -> Variables and Secrets to add GROQ_API_KEY
16
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
17
-
18
- if not GROQ_API_KEY:
19
- print("❌ API Key not found! Please set GROQ_API_KEY in Space Secrets.")
20
- else:
21
- print("✅ API Key loaded successfully!")
22
-
23
- # ====================== SYSTEM PROMPT ======================
24
- SYSTEM_PROMPT = """
25
- You are an expert social media sentiment and poll analysis AI.
26
- Focus on Yes/No, Agree/Disagree, Support/Oppose, and sentiment.
27
-
28
- Handle English + Urdu + Hindi + other languages well.
29
- Return ONLY valid JSON in this exact format:
30
- {
31
- "main_poll": {
32
- "question": "Suggested poll question",
33
- "yes_count": int,
34
- "no_count": int,
35
- "agree_count": int,
36
- "disagree_count": int,
37
- "support_count": int,
38
- "oppose_count": int,
39
- "neutral_count": int
40
- },
41
- "sentiment": {
42
- "positive": float,
43
- "negative": float,
44
- "neutral": float
45
- },
46
- "top_themes": ["theme1", "theme2"],
47
- "summary": "Short professional summary",
48
- "labeled_comments": [
49
- {"comment": "...", "opinion": "Yes|No|Agree|Disagree|Positive|Negative|Neutral|Mixed"}
50
- ]
51
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  """
53
 
54
- # ====================== TEXT CLEANING FOR PDF/UNICODE ======================
55
- def clean_text(text):
56
- if not text:
57
- return ""
58
- # Replace problematic characters
59
- text = re.sub(r'[\u2022\u2023\u25CF\u25BA\u25C4]', '-', text) # bullets
60
- text = re.sub(r'[\u2018\u2019\u201C\u201D]', '"', text) # quotes
61
- text = re.sub(r'[\u2013\u2014]', '-', text) # dashes
62
- # Remove any remaining control characters or non-Latin1 for FPDF safety
63
- text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)
64
- # Remove emojis/non-ASCII for the PDF generator (FPDF limitation)
65
- return text.encode('ascii', 'ignore').decode('ascii').strip()
66
-
67
- # ====================== HELPER FUNCTIONS ======================
68
- def extract_youtube_id(url):
69
- patterns = [
70
- r'youtu\.be/([a-zA-Z0-9_-]+)',
71
- r'v=([a-zA-Z0-9_-]+)',
72
- r'/embed/([a-zA-Z0-9_-]+)',
73
- r'/shorts/([a-zA-Z0-9_-]+)'
74
- ]
75
- for pattern in patterns:
76
- match = re.search(pattern, url)
77
- if match:
78
- return match.group(1)
79
- return None
80
-
81
- def fetch_youtube_comments(url, limit=100):
82
  try:
83
- video_id = extract_youtube_id(url)
84
- if not video_id:
85
- return []
86
-
87
- downloader = YoutubeCommentDownloader()
88
- comments = []
89
- # get_comments is more stable on servers
90
- generator = downloader.get_comments(video_id, sort_by=0)
91
-
92
- for comment in generator:
93
- comments.append(comment['text'])
94
- if len(comments) >= limit:
95
- break
96
- return comments
97
  except Exception as e:
98
- print(f"Fetch error: {e}")
99
- return []
100
-
101
- def analyze_comments_with_groq(comments, post_context=""):
102
- try:
103
- client = Groq(api_key=GROQ_API_KEY)
104
- # Clean comments and truncate to fit context window
105
- cleaned_comments = [clean_text(c) for c in comments]
106
- comments_text = "\n\n".join([f"C{i+1}: {c[:200]}" for i, c in enumerate(cleaned_comments)])
107
-
108
- user_prompt = f"Post Context: {post_context}\n\nAnalyze these comments:\n{comments_text}"
109
-
110
- response = client.chat.completions.create(
111
- model="llama-3.3-70b-versatile",
112
- messages=[
113
- {"role": "system", "content": SYSTEM_PROMPT},
114
- {"role": "user", "content": user_prompt}
115
- ],
116
- temperature=0.3,
117
- max_tokens=3000,
118
- response_format={"type": "json_object"}
119
- )
120
- return json.loads(response.choices[0].message.content)
121
- except Exception as e:
122
- print("Groq Error:", str(e))
123
- return None
124
-
125
- def create_pdf_report(analysis_result, poll_question):
126
- try:
127
- pdf = FPDF()
128
- pdf.add_page()
129
- pdf.set_font('Arial', 'B', 16)
130
- pdf.cell(0, 10, 'CommentSurvey AI Report', 0, 1, 'C')
131
- pdf.ln(10)
132
-
133
- pdf.set_font('Arial', 'B', 12)
134
- pdf.cell(0, 10, f'Poll Question: {clean_text(poll_question)}', 0, 1, 'L')
135
- pdf.ln(5)
136
-
137
- pdf.set_font('Arial', 'B', 12)
138
- pdf.cell(0, 10, 'Summary:', 0, 1, 'L')
139
- pdf.set_font('Arial', '', 11)
140
- pdf.multi_cell(0, 5, clean_text(analysis_result.get('summary', 'No summary.')))
141
- pdf.ln(10)
142
-
143
- pdf.output("CommentSurvey_Report.pdf")
144
- return "CommentSurvey_Report.pdf"
145
- except Exception as e:
146
- print(f"PDF Error: {e}")
147
- return None
148
-
149
- # ====================== MAIN ANALYSIS ======================
150
- def analyze(url):
151
- try:
152
- if not GROQ_API_KEY:
153
- return None, "❌ API Key Missing in Settings!", None, None, None, None
154
-
155
- if not url or not url.strip():
156
- return None, "❌ Please paste a YouTube URL", None, None, None, None
157
-
158
- comments = fetch_youtube_comments(url)
159
- if not comments:
160
- return None, "❌ Could not fetch comments (Video might be private or restricted).", None, None, None, None
161
-
162
- result = analyze_comments_with_groq(comments)
163
- if not result:
164
- return None, "❌ AI Analysis failed.", None, None, None, None
165
-
166
- main = result.get('main_poll', {})
167
- poll_values = [
168
- main.get('yes_count',0) + main.get('agree_count',0) + main.get('support_count',0),
169
- main.get('no_count',0) + main.get('disagree_count',0) + main.get('oppose_count',0),
170
- main.get('neutral_count',0)
171
- ]
172
-
173
- fig_poll = px.pie(
174
- names=['Yes/Agree/Support', 'No/Disagree/Oppose', 'Neutral'],
175
- values=poll_values,
176
- title="Main Poll Results",
177
- hole=0.4
178
- )
179
-
180
- sent = result.get('sentiment', {})
181
- fig_sent = px.bar(
182
- x=['Positive', 'Negative', 'Neutral'],
183
- y=[sent.get('positive',0), sent.get('negative',0), sent.get('neutral',0)],
184
- title="Sentiment Breakdown",
185
- color=['Positive', 'Negative', 'Neutral']
186
- )
187
-
188
- summary_text = f"**Question:** {main.get('question','N/A')}\n\n**Summary:** {result.get('summary','')}"
189
- pdf_path = create_pdf_report(result, main.get('question', 'Survey'))
190
- raw_df = pd.DataFrame(result.get('labeled_comments', []))
191
-
192
- return raw_df, f"✅ Analyzed {len(comments)} comments", fig_poll, fig_sent, summary_text, pdf_path
193
-
194
- except Exception as e:
195
- return None, f"❌ Error: {str(e)}", None, None, None, None
196
-
197
- # ====================== GRADIO UI ======================
198
- with gr.Blocks(title="CommentSurvey AI", theme=gr.themes.Soft()) as demo:
199
- gr.Markdown("# 📊 CommentSurvey AI\n**Turn YouTube Comments into Smart Insights**")
200
 
201
- with gr.Row():
202
- url_input = gr.Textbox(label="🌐 YouTube Link", placeholder="Paste here...")
203
- analyze_btn = gr.Button("🚀 Analyze", variant="primary")
 
 
204
 
205
- status = gr.Markdown("**Status:** Ready")
206
 
207
- with gr.Tabs():
208
- with gr.Tab("📊 Results"):
209
- poll_plot = gr.Plot()
210
- poll_md = gr.Markdown()
211
- with gr.Tab("😊 Sentiment"):
212
- sentiment_plot = gr.Plot()
213
- with gr.Tab("📜 Data"):
214
- raw_table = gr.Dataframe()
215
 
216
- download_btn = gr.File(label="📥 Download Report")
 
 
217
 
218
- analyze_btn.click(
219
- fn=analyze,
220
- inputs=[url_input],
221
- outputs=[raw_table, status, poll_plot, sentiment_plot, poll_md, download_btn]
222
- )
223
 
224
- if __name__ == "__main__":
225
- demo.launch()
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import gdown
3
+ import time
4
+ import gradio as gr
5
+ from google.colab import userdata
6
+
7
+ # Modern Imports
8
+ from langchain_community.document_loaders import PyPDFLoader
9
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
10
+ from langchain_huggingface import HuggingFaceEmbeddings
11
+ from langchain_community.vectorstores import FAISS
12
+ from langchain_groq import ChatGroq
13
+ from langchain_core.prompts import ChatPromptTemplate
14
+ from langchain_core.runnables import RunnablePassthrough
15
+ from langchain_core.output_parsers import StrOutputParser
16
+
17
+ # ==========================================
18
+ # 1. SETUP & KEYS
19
+ # ==========================================
20
+ os.environ["GROQ_API_KEY"] = userdata.get('ragapikey')
21
+
22
+ # --- UPDATE THIS LIST WITH ALL YOUR LINKS ---
23
+ links_to_process = [
24
+ "https://drive.google.com/file/d/1rb7AeJZrDNR-bq8Q9V4IvtzYZsDOvDH0/view?usp=sharing",
25
+ "https://drive.google.com/file/d/16PcJo_JaQHh1bx01lCAkc4QwQ6YnLb-K/view?usp=sharing"
26
+ #"https://drive.google.com/drive/folders/ANOTHER_FOLDER_ID"
27
+ ]
28
+
29
+ output_dir = 'knowledge_base'
30
+ if not os.path.exists(output_dir):
31
+ os.makedirs(output_dir)
32
+
33
+ # ==========================================
34
+ # 2. IMPROVED DOWNLOAD LOGIC
35
+ # ==========================================
36
+ def build_vector_db(links):
37
+ print(f"📥 Starting synchronization for {len(links)} sources...")
38
+
39
+ for link in links:
40
+ try:
41
+ if "/folders/" in link:
42
+ print(f"📂 Syncing Folder: {link}")
43
+ gdown.download_folder(url=link, output=output_dir, quiet=True, use_cookies=False)
44
+ else:
45
+ print(f"📄 Syncing Individual File: {link}")
46
+ # Use output_dir + "/" to ensure it saves into the folder
47
+ gdown.download(url=link, output=output_dir + "/", quiet=True)
48
+
49
+ time.sleep(1) # Small pause to respect Drive rate limits
50
+ except Exception as e:
51
+ print(f"⚠️ Skip Link: Could not download {link}. Error: {e}")
52
+
53
+ all_docs = []
54
+ # Use os.walk to find PDFs even inside subfolders downloaded by download_folder
55
+ for root, dirs, files in os.walk(output_dir):
56
+ for filename in files:
57
+ if filename.endswith(".pdf"):
58
+ file_path = os.path.join(root, filename)
59
+ try:
60
+ loader = PyPDFLoader(file_path)
61
+ all_docs.extend(loader.load())
62
+ except Exception as e:
63
+ print(f"❌ Error loading {filename}: {e}")
64
+
65
+ if not all_docs:
66
+ raise ValueError("No PDF documents found! Ensure links are set to 'Anyone with the link'.")
67
+
68
+ # Chunking & Embeddings
69
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
70
+ chunks = text_splitter.split_documents(all_docs)
71
+
72
+ print(f"🧠 Creating embeddings for {len(chunks)} text chunks...")
73
+ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
74
+
75
+ vector_db = FAISS.from_documents(chunks, embeddings)
76
+ print("✅ Multi-Source Vector Database Created Successfully!")
77
+ return vector_db
78
+
79
+ # Initialize
80
+ vector_store = build_vector_db(links_to_process)
81
+ retriever = vector_store.as_retriever(search_kwargs={"k": 3})
82
+
83
+ # ==========================================
84
+ # 3. MODERN RAG CHAIN
85
+ # ==========================================
86
+ llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)
87
+
88
+ template = """Answer the question based ONLY on the following context:
89
+ {context}
90
+
91
+ Question: {question}
92
+
93
+ Helpful Answer:"""
94
+
95
+ prompt = ChatPromptTemplate.from_template(template)
96
+
97
+ rag_chain = (
98
+ {"context": retriever, "question": RunnablePassthrough()}
99
+ | prompt
100
+ | llm
101
+ | StrOutputParser()
102
+ )
103
+
104
+ # ==========================================
105
+ # 4. PROFESSIONAL FRONTEND (GRADIO BLOCKS)
106
+ # ==========================================
107
+ custom_css = """
108
+ #main-container { max-width: 900px; margin: auto; padding: 20px; }
109
+ .header-text { text-align: center; color: #1e293b; margin-bottom: 2px; }
110
+ .report-box { background-color: #ffffff; border-radius: 8px; border: 1px solid #e2e8f0; padding: 15px; min-height: 200px; }
111
  """
112
 
113
+ def process_query(query):
114
+ if not query.strip():
115
+ return "### ⚠️ System Note\n*Please enter a strategic inquiry to begin analysis.*"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  try:
117
+ return rag_chain.invoke(query)
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  except Exception as e:
119
+ return f"### ❌ Error\nAn error occurred: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo"), css=custom_css) as demo:
122
+ with gr.Column(elem_id="main-container"):
123
+ gr.Markdown("# 🏛️ Enterprise Knowledge Engine", elem_classes="header-text")
124
+ gr.Markdown("<p style='text-align: center;'>Multi-Source Document Synthesis via Groq & FAISS</p>")
125
+ gr.HTML("<hr>")
126
 
127
+ user_input = gr.Textbox(label="Strategic Inquiry", placeholder="Ask a question about the collected knowledge base...", lines=3)
128
 
129
+ with gr.Row():
130
+ submit_btn = gr.Button("ANALYZE DATA", variant="primary", scale=2)
131
+ clear_btn = gr.ClearButton([user_input], value="RESET DASHBOARD", scale=1)
 
 
 
 
 
132
 
133
+ gr.Markdown("### 📋 Intelligence Report")
134
+ with gr.Column(elem_classes="report-box"):
135
+ output_display = gr.Markdown(value="_Awaiting input..._")
136
 
137
+ submit_btn.click(fn=process_query, inputs=user_input, outputs=output_display)
138
+ user_input.submit(fn=process_query, inputs=user_input, outputs=output_display)
 
 
 
139
 
140
+ demo.launch(share=True)