Refat81 committed on
Commit
d395d4e
·
verified ·
1 Parent(s): fd2cc7f

Update pages/facebook_extractor.py

Browse files
Files changed (1) hide show
  1. pages/facebook_extractor.py +559 -121
pages/facebook_extractor.py CHANGED
@@ -9,13 +9,14 @@ from typing import List, Dict
9
  import os
10
  import tempfile
11
 
12
- from langchain.text_splitter import CharacterTextSplitter
13
- from langchain.embeddings import HuggingFaceInstructEmbeddings
 
14
  from langchain.vectorstores import FAISS
15
  from langchain.memory import ConversationBufferMemory
16
  from langchain.chains import ConversationalRetrievalChain
17
  from langchain.schema import Document
18
- from langchain.chat_models import ChatHuggingFaceHub
19
 
20
  st.set_page_config(
21
  page_title="Facebook Data Extractor",
@@ -34,10 +35,12 @@ class FacebookDataSimulator:
34
  try:
35
  st.info(f"πŸ” Analyzing: {url}")
36
 
 
37
  real_data = self._try_real_extraction(url)
38
  if real_data.get("status") == "success":
39
  return real_data
40
 
 
41
  st.warning("⚠️ Using demo data (Facebook restrictions active)")
42
  return self._get_demo_data(url, data_type)
43
 
@@ -46,15 +49,29 @@ class FacebookDataSimulator:
46
  return self._get_demo_data(url, data_type)
47
 
48
  def _try_real_extraction(self, url: str) -> Dict:
 
49
  try:
 
50
  headers = {
51
- 'User-Agent': 'Mozilla/5.0',
 
 
 
 
 
 
52
  }
 
 
53
  response = requests.get(url, headers=headers, timeout=10, verify=False)
 
54
  if response.status_code == 200:
55
  soup = BeautifulSoup(response.text, 'html.parser')
 
 
56
  title = soup.find('title')
57
  description = soup.find('meta', attrs={'name': 'description'})
 
58
  return {
59
  "page_info": {
60
  "title": title.text if title else "Facebook Content",
@@ -71,13 +88,16 @@ class FacebookDataSimulator:
71
  }
72
  else:
73
  return {"status": "error", "source": "real"}
 
74
  except Exception:
75
  return {"status": "error", "source": "real"}
76
 
77
  def _extract_real_content(self, soup) -> List[Dict]:
 
78
  blocks = []
79
  text = soup.get_text()
80
  paragraphs = [p.strip() for p in text.split('.') if p.strip() and len(p.strip()) > 30]
 
81
  for i, paragraph in enumerate(paragraphs[:8]):
82
  blocks.append({
83
  "id": i + 1,
@@ -87,10 +107,13 @@ class FacebookDataSimulator:
87
  "content_type": "real_content",
88
  "is_public_content": True
89
  })
 
90
  return blocks
91
 
92
  def _get_demo_data(self, url: str, data_type: str) -> Dict:
 
93
  url_type = self._analyze_url_type(url)
 
94
  if 'group' in url_type.lower():
95
  return self._get_group_demo_data(url, data_type)
96
  elif 'page' in url_type.lower():
@@ -99,7 +122,9 @@ class FacebookDataSimulator:
99
  return self._get_general_demo_data(url, data_type)
100
 
101
  def _analyze_url_type(self, url: str) -> str:
 
102
  url_lower = url.lower()
 
103
  if 'group' in url_lower:
104
  return "Facebook Group"
105
  elif 'page' in url_lower or 'facebook.com/' in url_lower and '/pages/' not in url_lower:
@@ -112,7 +137,9 @@ class FacebookDataSimulator:
112
  return "Facebook Content"
113
 
114
  def _get_group_demo_data(self, url: str, data_type: str) -> Dict:
 
115
  group_name = self._extract_name_from_url(url) or "Gaming Community"
 
116
  return {
117
  "page_info": {
118
  "title": f"{group_name} | Facebook Group",
@@ -124,11 +151,46 @@ class FacebookDataSimulator:
124
  "access_note": "Public group - Limited data due to platform restrictions"
125
  },
126
  "content_blocks": [
127
- {"id": 1, "content": f"Welcome to {group_name}! This is a community for fans and enthusiasts to share their experiences, ask questions, and connect with like-minded people.", "length": 120, "word_count": 25, "content_type": "welcome_message", "is_public_content": True},
128
- {"id": 2, "content": "Just shared my latest project in the group! Would love to get some feedback from the community on the new features we're implementing.", "length": 95, "word_count": 18, "content_type": "member_post", "is_public_content": True},
129
- {"id": 3, "content": "Does anyone have experience with this issue? I've been trying to solve it for a while and could use some community wisdom.", "length": 88, "word_count": 16, "content_type": "question_post", "is_public_content": True},
130
- {"id": 4, "content": "Our monthly meetup is scheduled for next Saturday! Don't forget to RSVP so we can plan accordingly. Looking forward to seeing everyone there.", "length": 102, "word_count": 19, "content_type": "event_announcement", "is_public_content": True},
131
- {"id": 5, "content": "The community guidelines: Be respectful, no spam, keep discussions relevant to the group's topic, and help each other grow.", "length": 78, "word_count": 14, "content_type": "community_guidelines", "is_public_content": True}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  ],
133
  "url_type": "Facebook Group",
134
  "extraction_time": datetime.now().isoformat(),
@@ -138,7 +200,9 @@ class FacebookDataSimulator:
138
  }
139
 
140
  def _get_page_demo_data(self, url: str, data_type: str) -> Dict:
 
141
  page_name = self._extract_name_from_url(url) or "Brand Page"
 
142
  return {
143
  "page_info": {
144
  "title": f"{page_name} | Facebook Page",
@@ -150,10 +214,38 @@ class FacebookDataSimulator:
150
  "access_note": "Public page - Limited data due to platform restrictions"
151
  },
152
  "content_blocks": [
153
- {"id": 1, "content": f"Welcome to the official {page_name} Facebook page! Here you'll find the latest updates, news, and announcements from our team.", "length": 98, "word_count": 15, "content_type": "welcome_message", "is_public_content": True},
154
- {"id": 2, "content": "We're excited to announce our new product launch next week! Stay tuned for more details and special offers for our Facebook community.", "length": 92, "word_count": 16, "content_type": "announcement", "is_public_content": True},
155
- {"id": 3, "content": "Thank you to everyone who participated in our recent event! The feedback has been incredible and we're already planning the next one.", "length": 87, "word_count": 14, "content_type": "event_followup", "is_public_content": True},
156
- {"id": 4, "content": "Customer support hours: Monday-Friday 9AM-6PM. For urgent issues, please message us directly and we'll respond as soon as possible.", "length": 85, "word_count": 15, "content_type": "support_info", "is_public_content": True}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  ],
158
  "url_type": "Facebook Page",
159
  "extraction_time": datetime.now().isoformat(),
@@ -163,6 +255,7 @@ class FacebookDataSimulator:
163
  }
164
 
165
  def _get_general_demo_data(self, url: str, data_type: str) -> Dict:
 
166
  return {
167
  "page_info": {
168
  "title": "Facebook Content",
@@ -173,8 +266,22 @@ class FacebookDataSimulator:
173
  "access_note": "Public content - Platform restrictions apply"
174
  },
175
  "content_blocks": [
176
- {"id": 1, "content": "Community engagement and social interactions are key aspects of this platform. Users share content, connect with friends, and participate in discussions.", "length": 105, "word_count": 16, "content_type": "general_content", "is_public_content": True},
177
- {"id": 2, "content": "Recent updates have improved user experience with better content discovery and enhanced privacy controls for community members.", "length": 82, "word_count": 12, "content_type": "platform_updates", "is_public_content": True}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  ],
179
  "url_type": "Facebook Content",
180
  "extraction_time": datetime.now().isoformat(),
@@ -184,14 +291,18 @@ class FacebookDataSimulator:
184
  }
185
 
186
  def _extract_name_from_url(self, url: str) -> str:
 
 
187
  match = re.search(r'facebook\.com/(?:groups/|pages/)?([^/?]+)', url)
188
  if match:
189
  name = match.group(1)
 
190
  name = name.replace('-', ' ').title()
191
  return name
192
  return ""
193
-
194
  def _create_demo_data(self) -> Dict:
 
195
  return {
196
  "groups": {
197
  "gamersofbangladesh2": "Gaming Community Bangladesh",
@@ -205,99 +316,270 @@ class FacebookDataSimulator:
205
  }
206
  }
207
 
208
- # ------------------ Hugging Face AI Integration ------------------
209
-
210
  def get_embeddings():
211
- api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
212
- if not api_key:
213
- st.error("❌ HuggingFace API Key not found")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  return None
215
 
216
- embeddings = HuggingFaceInstructEmbeddings(
217
- model_name="hkunlp/instructor-mini",
218
- model_kwargs={"device": "cpu"},
219
- huggingfacehub_api_token=api_key
220
- )
221
- st.success("βœ… HuggingFace Embeddings loaded")
222
- return embeddings
223
-
224
  def get_llm():
225
- api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
226
- if not api_key:
227
- st.error("❌ HuggingFace API Key not found")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  return None
229
-
230
- llm = ChatHuggingFaceHub(
231
- repo_id="google/flan-t5-large",
232
- model_kwargs={"temperature":0.7, "max_new_tokens":512},
233
- huggingfacehub_api_token=api_key
234
- )
235
- st.success("βœ… HuggingFace LLM loaded")
236
- return llm
237
 
238
  def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
 
239
  try:
240
  if not extracted_data:
241
- return "No data available."
242
 
243
  page_info = extracted_data.get('page_info', {})
244
  content_blocks = extracted_data.get('content_blocks', [])
245
  url_type = extracted_data.get('url_type', 'Facebook Content')
246
  source = extracted_data.get('source', 'demo')
 
247
  user_input_lower = user_input.lower()
248
-
 
249
  if any(word in user_input_lower for word in ['summary', 'summarize', 'overview']):
250
- return f"**πŸ“Š Summary of {page_info.get('title','Facebook Content')}**\nType: {url_type}\nData Source: {source.upper()}\nBlocks: {len(content_blocks)}"
251
- elif any(word in user_input_lower for word in ['purpose','about','what is']):
252
- return f"**🎯 Purpose:** {page_info.get('description','Community engagement and content sharing')}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  else:
254
- return f"**πŸ€– Analysis:** This {url_type.lower()} contains {len(content_blocks)} content blocks."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  except Exception as e:
256
  return f"Analysis error: {str(e)}"
257
 
258
  def process_facebook_data(extracted_data):
 
259
  if not extracted_data or extracted_data.get("status") != "success":
260
  return None, []
261
-
262
- all_text = ""
263
- for block in extracted_data["content_blocks"]:
264
- all_text += block["content"] + "\n\n"
265
-
266
- splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  chunks = splitter.split_text(all_text)
268
  documents = [Document(page_content=chunk) for chunk in chunks]
269
-
270
- embeddings = get_embeddings()
271
- if embeddings is None:
272
- return "simple", documents
273
-
274
- vectorstore = FAISS.from_documents(documents, embeddings)
275
- return vectorstore, documents
276
 
277
  def create_chatbot(vectorstore):
278
- llm = get_llm()
279
- if llm is None:
280
- return "simple"
281
-
282
- memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")
283
- chain = ConversationalRetrievalChain.from_llm(
284
- llm=llm,
285
- retriever=vectorstore.as_retriever(search_kwargs={"k":3}),
286
- memory=memory,
287
- return_source_documents=True,
288
- output_key="answer"
289
- )
290
- return chain
291
-
292
- # ------------------ Streamlit UI ------------------
 
 
 
 
 
 
 
 
293
 
294
  def main():
295
- st.title("πŸ“˜ Facebook Data Extractor (Live Hugging Face)")
296
- st.markdown("**University Project** - Real data when possible, demo data if restricted")
297
 
298
  if st.button("← Back to Main Dashboard"):
299
  st.switch_page("app.py")
300
-
 
301
  if "extractor" not in st.session_state:
302
  st.session_state.extractor = FacebookDataSimulator()
303
  if "facebook_data" not in st.session_state:
@@ -309,69 +591,225 @@ def main():
309
  if "chat_history" not in st.session_state:
310
  st.session_state.chat_history = []
311
  if "processing_mode" not in st.session_state:
312
- st.session_state.processing_mode = "ai"
313
-
314
  # Sidebar
315
  with st.sidebar:
316
  st.header("βš™οΈ Facebook Configuration")
317
- data_type = st.selectbox("Content Type", ["group","page","event","post","general"])
318
- facebook_url = st.text_input("Facebook URL","https://www.facebook.com/groups/gamersofbangladesh2")
319
- processing_mode = st.radio("Analysis Mode:", ["AI Analysis (Recommended)","Simple Analysis"])
320
- st.session_state.processing_mode = "ai" if processing_mode=="AI Analysis (Recommended)" else "simple"
321
-
322
- if st.button("πŸš€ Extract Facebook Data"):
323
- url_to_use = facebook_url
324
- if not url_to_use or 'facebook.com' not in url_to_use:
325
- st.error("❌ Enter a valid Facebook URL")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  else:
327
  with st.spinner("πŸ”„ Analyzing Facebook data..."):
328
  extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
 
329
  if extracted_data.get("status") == "success":
330
  st.session_state.facebook_data = extracted_data
331
- if st.session_state.processing_mode=="ai":
332
- vectorstore, _ = process_facebook_data(extracted_data)
333
- if vectorstore!="simple":
334
- st.session_state.vectorstore = vectorstore
335
- st.session_state.chatbot = create_chatbot(vectorstore)
 
 
 
 
336
  else:
337
- st.warning("⚠️ Using simple analysis")
338
  st.session_state.chatbot = "simple"
 
339
  else:
340
  st.session_state.chatbot = "simple"
341
- st.success("βœ… Data ready!")
 
 
 
 
 
 
 
342
  else:
343
- st.error("❌ Extraction failed")
344
-
345
- # Main columns
346
- col1, col2 = st.columns([1,1])
 
 
 
 
 
 
 
 
 
 
 
347
  with col1:
348
  st.header("πŸ“Š Extraction Results")
 
349
  if st.session_state.facebook_data:
350
  data = st.session_state.facebook_data
351
- page_info = data["page_info"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  st.write(f"**Title:** {page_info['title']}")
353
- st.write(f"**Description:** {page_info.get('description','No description')}")
354
- st.write(f"**Access:** {page_info.get('access_note','Public')}")
355
- st.subheader("Content Blocks")
356
- for i, block in enumerate(data["content_blocks"]):
357
- st.markdown(f"**Block {i+1}:** {block['content']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
 
359
  with col2:
360
- st.header("πŸ’¬ Ask About This Data")
361
- if st.session_state.facebook_data:
362
- user_input = st.text_input("Enter your question")
 
 
 
 
 
 
 
 
 
 
 
 
363
  if user_input:
364
- if st.session_state.chatbot=="simple":
365
- answer = simple_chat_analysis(user_input, st.session_state.facebook_data)
366
- st.markdown(answer)
367
- else:
368
- chain = st.session_state.chatbot
369
- result = chain({"question":user_input})
370
- st.markdown(result['answer'])
371
- if result.get("source_documents"):
372
- st.subheader("πŸ“‘ Source Documents")
373
- for doc in result["source_documents"]:
374
- st.markdown(f"- {doc.page_content[:300]}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
 
376
- if __name__=="__main__":
377
- main()
 
9
  import os
10
  import tempfile
11
 
12
+ # Import your existing AI components
13
+ from langchain_text_splitters import CharacterTextSplitter
14
+ from langchain_community.embeddings import HuggingFaceEmbeddings
15
  from langchain.vectorstores import FAISS
16
  from langchain.memory import ConversationBufferMemory
17
  from langchain.chains import ConversationalRetrievalChain
18
  from langchain.schema import Document
19
+ from langchain_community.llms import HuggingFaceHub
20
 
21
  st.set_page_config(
22
  page_title="Facebook Data Extractor",
 
35
  try:
36
  st.info(f"πŸ” Analyzing: {url}")
37
 
38
+ # Try real extraction first
39
  real_data = self._try_real_extraction(url)
40
  if real_data.get("status") == "success":
41
  return real_data
42
 
43
+ # If real extraction fails, use demo data
44
  st.warning("⚠️ Using demo data (Facebook restrictions active)")
45
  return self._get_demo_data(url, data_type)
46
 
 
49
  return self._get_demo_data(url, data_type)
50
 
51
  def _try_real_extraction(self, url: str) -> Dict:
52
+ """Try real extraction with better error handling"""
53
  try:
54
+ # Use a proxy-like approach with different user agents
55
  headers = {
56
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
57
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
58
+ 'Accept-Language': 'en-US,en;q=0.5',
59
+ 'Accept-Encoding': 'gzip, deflate, br',
60
+ 'DNT': '1',
61
+ 'Connection': 'keep-alive',
62
+ 'Upgrade-Insecure-Requests': '1',
63
  }
64
+
65
+ # Try with shorter timeout
66
  response = requests.get(url, headers=headers, timeout=10, verify=False)
67
+
68
  if response.status_code == 200:
69
  soup = BeautifulSoup(response.text, 'html.parser')
70
+
71
+ # Extract basic info
72
  title = soup.find('title')
73
  description = soup.find('meta', attrs={'name': 'description'})
74
+
75
  return {
76
  "page_info": {
77
  "title": title.text if title else "Facebook Content",
 
88
  }
89
  else:
90
  return {"status": "error", "source": "real"}
91
+
92
  except Exception:
93
  return {"status": "error", "source": "real"}
94
 
95
  def _extract_real_content(self, soup) -> List[Dict]:
96
+ """Extract content from real page"""
97
  blocks = []
98
  text = soup.get_text()
99
  paragraphs = [p.strip() for p in text.split('.') if p.strip() and len(p.strip()) > 30]
100
+
101
  for i, paragraph in enumerate(paragraphs[:8]):
102
  blocks.append({
103
  "id": i + 1,
 
107
  "content_type": "real_content",
108
  "is_public_content": True
109
  })
110
+
111
  return blocks
112
 
113
  def _get_demo_data(self, url: str, data_type: str) -> Dict:
114
+ """Get realistic demo data based on URL type"""
115
  url_type = self._analyze_url_type(url)
116
+
117
  if 'group' in url_type.lower():
118
  return self._get_group_demo_data(url, data_type)
119
  elif 'page' in url_type.lower():
 
122
  return self._get_general_demo_data(url, data_type)
123
 
124
  def _analyze_url_type(self, url: str) -> str:
125
+ """Analyze URL type for realistic demo data"""
126
  url_lower = url.lower()
127
+
128
  if 'group' in url_lower:
129
  return "Facebook Group"
130
  elif 'page' in url_lower or 'facebook.com/' in url_lower and '/pages/' not in url_lower:
 
137
  return "Facebook Content"
138
 
139
  def _get_group_demo_data(self, url: str, data_type: str) -> Dict:
140
+ """Get realistic group demo data"""
141
  group_name = self._extract_name_from_url(url) or "Gaming Community"
142
+
143
  return {
144
  "page_info": {
145
  "title": f"{group_name} | Facebook Group",
 
151
  "access_note": "Public group - Limited data due to platform restrictions"
152
  },
153
  "content_blocks": [
154
+ {
155
+ "id": 1,
156
+ "content": f"Welcome to {group_name}! This is a community for fans and enthusiasts to share their experiences, ask questions, and connect with like-minded people.",
157
+ "length": 120,
158
+ "word_count": 25,
159
+ "content_type": "welcome_message",
160
+ "is_public_content": True
161
+ },
162
+ {
163
+ "id": 2,
164
+ "content": "Just shared my latest project in the group! Would love to get some feedback from the community on the new features we're implementing.",
165
+ "length": 95,
166
+ "word_count": 18,
167
+ "content_type": "member_post",
168
+ "is_public_content": True
169
+ },
170
+ {
171
+ "id": 3,
172
+ "content": "Does anyone have experience with this issue? I've been trying to solve it for a while and could use some community wisdom.",
173
+ "length": 88,
174
+ "word_count": 16,
175
+ "content_type": "question_post",
176
+ "is_public_content": True
177
+ },
178
+ {
179
+ "id": 4,
180
+ "content": "Our monthly meetup is scheduled for next Saturday! Don't forget to RSVP so we can plan accordingly. Looking forward to seeing everyone there.",
181
+ "length": 102,
182
+ "word_count": 19,
183
+ "content_type": "event_announcement",
184
+ "is_public_content": True
185
+ },
186
+ {
187
+ "id": 5,
188
+ "content": "The community guidelines: Be respectful, no spam, keep discussions relevant to the group's topic, and help each other grow.",
189
+ "length": 78,
190
+ "word_count": 14,
191
+ "content_type": "community_guidelines",
192
+ "is_public_content": True
193
+ }
194
  ],
195
  "url_type": "Facebook Group",
196
  "extraction_time": datetime.now().isoformat(),
 
200
  }
201
 
202
  def _get_page_demo_data(self, url: str, data_type: str) -> Dict:
203
+ """Get realistic page demo data"""
204
  page_name = self._extract_name_from_url(url) or "Brand Page"
205
+
206
  return {
207
  "page_info": {
208
  "title": f"{page_name} | Facebook Page",
 
214
  "access_note": "Public page - Limited data due to platform restrictions"
215
  },
216
  "content_blocks": [
217
+ {
218
+ "id": 1,
219
+ "content": f"Welcome to the official {page_name} Facebook page! Here you'll find the latest updates, news, and announcements from our team.",
220
+ "length": 98,
221
+ "word_count": 15,
222
+ "content_type": "welcome_message",
223
+ "is_public_content": True
224
+ },
225
+ {
226
+ "id": 2,
227
+ "content": "We're excited to announce our new product launch next week! Stay tuned for more details and special offers for our Facebook community.",
228
+ "length": 92,
229
+ "word_count": 16,
230
+ "content_type": "announcement",
231
+ "is_public_content": True
232
+ },
233
+ {
234
+ "id": 3,
235
+ "content": "Thank you to everyone who participated in our recent event! The feedback has been incredible and we're already planning the next one.",
236
+ "length": 87,
237
+ "word_count": 14,
238
+ "content_type": "event_followup",
239
+ "is_public_content": True
240
+ },
241
+ {
242
+ "id": 4,
243
+ "content": "Customer support hours: Monday-Friday 9AM-6PM. For urgent issues, please message us directly and we'll respond as soon as possible.",
244
+ "length": 85,
245
+ "word_count": 15,
246
+ "content_type": "support_info",
247
+ "is_public_content": True
248
+ }
249
  ],
250
  "url_type": "Facebook Page",
251
  "extraction_time": datetime.now().isoformat(),
 
255
  }
256
 
257
  def _get_general_demo_data(self, url: str, data_type: str) -> Dict:
258
+ """Get general demo data"""
259
  return {
260
  "page_info": {
261
  "title": "Facebook Content",
 
266
  "access_note": "Public content - Platform restrictions apply"
267
  },
268
  "content_blocks": [
269
+ {
270
+ "id": 1,
271
+ "content": "Community engagement and social interactions are key aspects of this platform. Users share content, connect with friends, and participate in discussions.",
272
+ "length": 105,
273
+ "word_count": 16,
274
+ "content_type": "general_content",
275
+ "is_public_content": True
276
+ },
277
+ {
278
+ "id": 2,
279
+ "content": "Recent updates have improved user experience with better content discovery and enhanced privacy controls for community members.",
280
+ "length": 82,
281
+ "word_count": 12,
282
+ "content_type": "platform_updates",
283
+ "is_public_content": True
284
+ }
285
  ],
286
  "url_type": "Facebook Content",
287
  "extraction_time": datetime.now().isoformat(),
 
291
  }
292
 
293
  def _extract_name_from_url(self, url: str) -> str:
294
+ """Extract name from URL for realistic demo data"""
295
+ # Extract name from URL for more realistic demo data
296
  match = re.search(r'facebook\.com/(?:groups/|pages/)?([^/?]+)', url)
297
  if match:
298
  name = match.group(1)
299
+ # Clean up the name
300
  name = name.replace('-', ' ').title()
301
  return name
302
  return ""
303
+
304
  def _create_demo_data(self) -> Dict:
305
+ """Create comprehensive demo data"""
306
  return {
307
  "groups": {
308
  "gamersofbangladesh2": "Gaming Community Bangladesh",
 
316
  }
317
  }
318
 
 
 
319
  def get_embeddings():
320
+ """Initialize embeddings with better error handling and cache management"""
321
+ try:
322
+ # Try multiple embedding models with different cache directories
323
+ model_options = [
324
+ "sentence-transformers/all-MiniLM-L6-v2",
325
+ "sentence-transformers/paraphrase-MiniLM-L3-v2",
326
+ "sentence-transformers/all-mpnet-base-v2"
327
+ ]
328
+
329
+ for model_name in model_options:
330
+ try:
331
+ st.info(f"πŸ”„ Trying embedding model: {model_name}")
332
+
333
+ # Use temporary directory for cache to avoid permission issues
334
+ with tempfile.TemporaryDirectory() as temp_cache:
335
+ embeddings = HuggingFaceEmbeddings(
336
+ model_name=model_name,
337
+ cache_folder=temp_cache,
338
+ model_kwargs={'device': 'cpu'}
339
+ )
340
+
341
+ # Test the embeddings
342
+ test_text = "Hello world"
343
+ test_embedding = embeddings.embed_query(test_text)
344
+ if test_embedding and len(test_embedding) > 0:
345
+ st.success(f"βœ… Loaded embeddings: {model_name.split('/')[-1]}")
346
+ return embeddings
347
+
348
+ except Exception as e:
349
+ st.warning(f"⚠️ Failed to load {model_name}: {str(e)}")
350
+ continue
351
+
352
+ # If all models fail, try without cache
353
+ st.warning("πŸ”„ Trying fallback embedding method...")
354
+ try:
355
+ embeddings = HuggingFaceEmbeddings(
356
+ model_name="sentence-transformers/all-MiniLM-L6-v2"
357
+ )
358
+ st.success("βœ… Loaded fallback embeddings")
359
+ return embeddings
360
+ except Exception as e:
361
+ st.error(f"❌ All embedding models failed: {e}")
362
+ return None
363
+
364
+ except Exception as e:
365
+ st.error(f"❌ Embeddings error: {e}")
366
  return None
367
 
 
 
 
 
 
 
 
 
368
  def get_llm():
369
+ """Initialize HuggingFace LLM"""
370
+ try:
371
+ api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
372
+ if not api_key:
373
+ st.error("HuggingFace API Key not found")
374
+ return None
375
+
376
+ # Try multiple models
377
+ model_options = [
378
+ "mistralai/Mistral-7B-Instruct-v0.1",
379
+ "google/flan-t5-large",
380
+ "microsoft/DialoGPT-large"
381
+ ]
382
+
383
+ for model_id in model_options:
384
+ try:
385
+ st.info(f"πŸ”„ Trying LLM: {model_id}")
386
+
387
+ llm = HuggingFaceHub(
388
+ repo_id=model_id,
389
+ huggingfacehub_api_token=api_key,
390
+ model_kwargs={
391
+ "temperature": 0.7,
392
+ "max_length": 512,
393
+ "max_new_tokens": 256,
394
+ }
395
+ )
396
+
397
+ # Test the model
398
+ test_response = llm.invoke("Hello")
399
+ if test_response and len(test_response.strip()) > 0:
400
+ st.success(f"βœ… Loaded LLM: {model_id.split('/')[-1]}")
401
+ return llm
402
+
403
+ except Exception as e:
404
+ st.warning(f"⚠️ Failed to load {model_id}: {str(e)}")
405
+ continue
406
+
407
+ st.error("❌ All LLMs failed to load")
408
+ return None
409
+
410
+ except Exception as e:
411
+ st.error(f"❌ LLM error: {e}")
412
  return None
 
 
 
 
 
 
 
 
413
 
414
  def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
415
+ """Simple rule-based chat analysis when embeddings fail"""
416
  try:
417
  if not extracted_data:
418
+ return "No data available for analysis."
419
 
420
  page_info = extracted_data.get('page_info', {})
421
  content_blocks = extracted_data.get('content_blocks', [])
422
  url_type = extracted_data.get('url_type', 'Facebook Content')
423
  source = extracted_data.get('source', 'demo')
424
+
425
  user_input_lower = user_input.lower()
426
+
427
+ # Basic analysis based on input
428
  if any(word in user_input_lower for word in ['summary', 'summarize', 'overview']):
429
+ response_lines = [
430
+ f"**πŸ“Š Summary of {page_info.get('title', 'Facebook Content')}**",
431
+ "",
432
+ f"**Type:** {url_type}",
433
+ f"**Data Source:** {source.upper()}",
434
+ f"**Description:** {page_info.get('description', 'No description available')}",
435
+ "",
436
+ f"This appears to be a {url_type.lower()} with {len(content_blocks)} content blocks of public information.",
437
+ "",
438
+ "**Key Content Types:**",
439
+ f"{', '.join(set(block['content_type'] for block in content_blocks))}",
440
+ "",
441
+ "The content focuses on community engagement and social interactions."
442
+ ]
443
+ return "\n".join(response_lines)
444
+
445
+ elif any(word in user_input_lower for word in ['purpose', 'about', 'what is']):
446
+ community_posts = len([b for b in content_blocks if 'community' in b['content_type'].lower()])
447
+ announcement_posts = len([b for b in content_blocks if 'announcement' in b['content_type'].lower()])
448
+ member_posts = len([b for b in content_blocks if 'post' in b['content_type'].lower()])
449
+
450
+ response_lines = [
451
+ "**🎯 Purpose Analysis**",
452
+ "",
453
+ f"Based on the extracted data, this {url_type.lower()} appears to be focused on:",
454
+ "",
455
+ f"- **Community Building:** {community_posts} community-related posts",
456
+ f"- **Information Sharing:** {announcement_posts} announcements",
457
+ f"- **Member Engagement:** {member_posts} member posts",
458
+ "",
459
+ f"**Overall Purpose:** {page_info.get('description', 'Community engagement and content sharing')}"
460
+ ]
461
+ return "\n".join(response_lines)
462
+
463
+ elif any(word in user_input_lower for word in ['activity', 'engagement', 'active']):
464
+ active_blocks = len([b for b in content_blocks if any(word in b['content_type'].lower() for word in ['post', 'question', 'event'])])
465
+ info_blocks = len(content_blocks) - active_blocks
466
+
467
+ response_lines = [
468
+ "**πŸ“ˆ Activity Analysis**",
469
+ "",
470
+ "**Content Activity Level:**",
471
+ f"- Total Content Blocks: {len(content_blocks)}",
472
+ f"- Active Engagement Posts: {active_blocks}",
473
+ f"- Informational Posts: {info_blocks}",
474
+ "",
475
+ f"The {url_type.lower()} shows a good mix of member engagement and informational content, suggesting an active community."
476
+ ]
477
+ return "\n".join(response_lines)
478
+
479
  else:
480
+ response_lines = [
481
+ "**πŸ€– Analysis Response**",
482
+ "",
483
+ f"I've analyzed the {url_type.lower()} data for you.",
484
+ "",
485
+ f"**Your question:** \"{user_input}\"",
486
+ f"**Content Source:** {source.upper()} data",
487
+ f"**Content Type:** {url_type}",
488
+ "",
489
+ f"This {url_type.lower()} contains {len(content_blocks)} pieces of content focusing on community engagement and information sharing.",
490
+ "",
491
+ "**Try asking:**",
492
+ "- \"What is the main purpose of this group/page?\"",
493
+ "- \"Summarize the content and activities\"",
494
+ "- \"What kind of engagement does this content show?\""
495
+ ]
496
+ return "\n".join(response_lines)
497
+
498
  except Exception as e:
499
  return f"Analysis error: {str(e)}"
500
 
501
def process_facebook_data(extracted_data):
    """Turn an extraction result into text chunks for the analysis chat.

    The page metadata and every content block are flattened into a single
    plain-text report. The report is then split into overlapping chunks,
    each wrapped in a ``Document``.

    Args:
        extracted_data: Dict produced by the extractor. It is processed only
            when ``extracted_data["status"] == "success"``. Keys read here:
            ``page_info``, ``content_blocks``, ``url_type``, ``source`` and
            ``extraction_time``. Missing keys fall back to neutral defaults
            instead of raising ``KeyError``.

    Returns:
        ``(None, [])`` when there is nothing to process; otherwise
        ``("simple", documents)``. The first element is the mode marker the
        caller compares against; no vector store is built here.
    """
    if not extracted_data or extracted_data.get("status") != "success":
        return None, []

    page_info = extracted_data.get("page_info") or {}
    content_blocks = extracted_data.get("content_blocks") or []
    url_type = extracted_data.get("url_type", "Unknown")
    source = extracted_data.get("source", "unknown")
    rule = "=" * 50

    # Collect the report piece by piece and join once at the end rather than
    # growing a string with repeated "+=".
    parts = [
        f"FACEBOOK DATA ANALYSIS\n{rule}\n\n",
        "📄 PAGE INFORMATION:\n",
        f"Title: {page_info.get('title', 'Facebook Content')}\n",
        f"URL Type: {url_type}\n",
        f"Data Source: {source.upper()}\n",
        f"Access: {page_info.get('access_note', 'Public content')}\n",
    ]

    # Only one audience figure is reported; the member count takes priority.
    if page_info.get("member_count"):
        parts.append(f"Members: {page_info['member_count']}\n")
    elif page_info.get("follower_count"):
        parts.append(f"Followers: {page_info['follower_count']}\n")

    public_total = sum(
        1 for block in content_blocks if block.get("is_public_content")
    )
    parts.extend([
        f"Extracted: {extracted_data.get('extraction_time', 'unknown')}\n\n",
        "📊 CONTENT ANALYSIS:\n",
        f"Content Blocks: {len(content_blocks)}\n",
        f"Public Content: {public_total} blocks\n\n",
    ])

    for index, block in enumerate(content_blocks, start=1):
        content = block.get("content", "")
        # Derive the word count from the text when the block does not carry one.
        word_count = block.get("word_count", len(str(content).split()))
        parts.append(
            f"--- BLOCK {index} ---\n"
            f"Type: {block.get('content_type', 'unknown')}\n"
            f"Words: {word_count} | "
            f"Public: {block.get('is_public_content', False)}\n"
            f"Content: {content}\n\n"
        )

    parts.append(rule)
    all_text = "".join(parts)

    # Split into chunks
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = splitter.split_text(all_text)
    documents = [Document(page_content=chunk) for chunk in chunks]

    return "simple", documents  # Return simple mode instead of vectorstore
 
 
 
 
 
549
 
550
def create_chatbot(vectorstore):
    """Create the conversational retrieval chain behind the analysis chat.

    Args:
        vectorstore: Object exposing ``as_retriever``. Anything else (for
            example ``None`` or the ``"simple"`` mode marker) selects simple
            mode without attempting to load a model.

    Returns:
        The ``ConversationalRetrievalChain`` on success, or the string
        ``"simple"`` when no retriever is available, no LLM could be
        obtained, or chain construction failed.
    """
    # Without a retriever the chain cannot be built. Bail out before the LLM
    # is loaded instead of surfacing an AttributeError to the user.
    if not hasattr(vectorstore, "as_retriever"):
        return "simple"

    try:
        llm = get_llm()
        if llm is None:
            return "simple"  # Return simple mode if LLM fails

        # The chain also returns source documents, so the memory is told
        # explicitly which output ("answer") to record.
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key="answer",
        )

        return ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
            memory=memory,
            return_source_documents=True,
            output_key="answer",
        )
    except Exception as e:
        # Best-effort boundary: report the failure in the UI and degrade to
        # the rule-based mode rather than crashing the page.
        st.error(f"Chatbot creation failed: {str(e)}")
        return "simple"  # Fallback to simple mode
574
 
575
  def main():
576
+ st.title("πŸ“˜ Facebook Data Extractor")
577
+ st.markdown("**University Project** - Real data when possible, realistic demo data when restricted")
578
 
579
  if st.button("← Back to Main Dashboard"):
580
  st.switch_page("app.py")
581
+
582
+ # Initialize session state
583
  if "extractor" not in st.session_state:
584
  st.session_state.extractor = FacebookDataSimulator()
585
  if "facebook_data" not in st.session_state:
 
591
  if "chat_history" not in st.session_state:
592
  st.session_state.chat_history = []
593
  if "processing_mode" not in st.session_state:
594
+ st.session_state.processing_mode = "ai" # ai or simple
595
+
596
  # Sidebar
597
  with st.sidebar:
598
  st.header("βš™οΈ Facebook Configuration")
599
+
600
+ data_type = st.selectbox(
601
+ "Content Type",
602
+ ["group", "page", "event", "post", "general"],
603
+ help="Select the type of Facebook content"
604
+ )
605
+
606
+ facebook_url = st.text_input(
607
+ "Facebook URL",
608
+ placeholder="https://www.facebook.com/groups/gamersofbangladesh2",
609
+ help="Enter any Facebook URL for analysis"
610
+ )
611
+
612
+ # Processing mode
613
+ st.subheader("πŸ”§ Processing Mode")
614
+ processing_mode = st.radio(
615
+ "Choose analysis mode:",
616
+ ["AI Analysis (Recommended)", "Simple Analysis"],
617
+ help="AI Analysis uses embeddings, Simple uses rule-based"
618
+ )
619
+
620
+ st.session_state.processing_mode = "ai" if processing_mode == "AI Analysis (Recommended)" else "simple"
621
+
622
+ # Quick test URLs
623
+ st.markdown("### πŸš€ Test URLs")
624
+ test_urls = {
625
+ "Gaming Group": "https://www.facebook.com/groups/gamersofbangladesh2",
626
+ "Tech Community": "https://www.facebook.com/groups/programmingcommunity",
627
+ "Business Page": "https://www.facebook.com/Meta/",
628
+ }
629
+
630
+ for name, url in test_urls.items():
631
+ if st.button(f"πŸ”— {name}", key=f"fb_{name}"):
632
+ st.session_state.current_fb_url = url
633
+ st.rerun()
634
+
635
+ if st.button("πŸš€ Extract Facebook Data", type="primary"):
636
+ url_to_use = facebook_url or getattr(st.session_state, 'current_fb_url', '')
637
+
638
+ if not url_to_use:
639
+ st.error("❌ Please enter a Facebook URL")
640
+ elif 'facebook.com' not in url_to_use:
641
+ st.error("❌ Please enter a valid Facebook URL")
642
  else:
643
  with st.spinner("πŸ”„ Analyzing Facebook data..."):
644
  extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
645
+
646
  if extracted_data.get("status") == "success":
647
  st.session_state.facebook_data = extracted_data
648
+
649
+ # Process based on selected mode
650
+ if st.session_state.processing_mode == "ai":
651
+ result = process_facebook_data(extracted_data)
652
+ if result and result[0] != "simple":
653
+ st.session_state.vectorstore = result[0]
654
+ st.session_state.chatbot = create_chatbot(result[0])
655
+ st.session_state.chat_history = []
656
+ st.success("βœ… AI analysis ready!")
657
  else:
658
+ st.warning("⚠️ Using simple analysis (AI features limited)")
659
  st.session_state.chatbot = "simple"
660
+ st.session_state.chat_history = []
661
  else:
662
  st.session_state.chatbot = "simple"
663
+ st.session_state.chat_history = []
664
+ st.success("βœ… Simple analysis ready!")
665
+
666
+ source = extracted_data.get('source', 'unknown')
667
+ if source == 'demo':
668
+ st.warning("πŸ“ Using realistic demo data (Facebook restrictions active)")
669
+ else:
670
+ st.success("βœ… Real data extracted successfully!")
671
  else:
672
+ error_msg = extracted_data.get("error", "Unknown error")
673
+ st.error(f"❌ Extraction failed: {error_msg}")
674
+
675
+ if st.session_state.facebook_data:
676
+ st.markdown("---")
677
+ if st.button("πŸ—‘οΈ Clear Data", type="secondary"):
678
+ st.session_state.facebook_data = None
679
+ st.session_state.vectorstore = None
680
+ st.session_state.chatbot = None
681
+ st.session_state.chat_history = []
682
+ st.rerun()
683
+
684
+ # Main content
685
+ col1, col2 = st.columns([1, 1])
686
+
687
  with col1:
688
  st.header("πŸ“Š Extraction Results")
689
+
690
  if st.session_state.facebook_data:
691
  data = st.session_state.facebook_data
692
+ page_info = data['page_info']
693
+ content_blocks = data['content_blocks']
694
+ source = data.get('source', 'unknown')
695
+
696
+ if source == 'demo':
697
+ st.warning("πŸ“ **Demo Data** - Realistic simulation (Facebook restrictions)")
698
+ else:
699
+ st.success("βœ… **Real Data** - Successfully extracted")
700
+
701
+ # Show processing mode
702
+ if st.session_state.processing_mode == "simple":
703
+ st.info("πŸ”§ **Simple Analysis Mode** - Rule-based processing")
704
+ else:
705
+ st.info("πŸ€– **AI Analysis Mode** - Embedding-based processing")
706
+
707
+ # Metrics
708
+ col1, col2, col3 = st.columns(3)
709
+ with col1:
710
+ st.metric("Content Blocks", len(content_blocks))
711
+ with col2:
712
+ st.metric("Data Source", source.upper())
713
+ with col3:
714
+ st.metric("Analysis Mode", "AI" if st.session_state.processing_mode == "ai" else "Simple")
715
+
716
+ # Page info
717
+ st.subheader("🏷️ Page Information")
718
  st.write(f"**Title:** {page_info['title']}")
719
+ st.write(f"**URL Type:** {data['url_type']}")
720
+ st.write(f"**Description:** {page_info.get('description', 'No description')}")
721
+
722
+ if page_info.get('member_count'):
723
+ st.write(f"**Members:** {page_info['member_count']}")
724
+ elif page_info.get('follower_count'):
725
+ st.write(f"**Followers:** {page_info['follower_count']}")
726
+
727
+ st.write(f"**Access:** {page_info.get('access_note', 'Public content')}")
728
+
729
+ # Content samples
730
+ st.subheader("πŸ“ Content Analysis")
731
+ for i, block in enumerate(content_blocks):
732
+ with st.expander(f"Content {i+1} - {block['content_type']} ({block['word_count']} words)"):
733
+ st.write(block['content'])
734
+ st.caption(f"Public: {block['is_public_content']}")
735
+
736
+ else:
737
+ st.info("""
738
+ ## πŸ“˜ Facebook Data Extractor
739
+
740
+ **University Project Feature**
741
+
742
+ **How it works:**
743
+ 1. Enter any Facebook URL
744
+ 2. System tries real data extraction
745
+ 3. If blocked, uses **realistic demo data**
746
+ 4. Choose between AI or Simple analysis
747
+
748
+ **Analysis Modes:**
749
+ - πŸ€– **AI Analysis**: Uses embeddings and Mistral AI
750
+ - πŸ”§ **Simple Analysis**: Rule-based (works without embeddings)
751
+
752
+ **Perfect for demonstrating:**
753
+ - Social media data extraction concepts
754
+ - AI analysis capabilities
755
+ - Platform integration
756
+ - Error handling strategies
757
+ """)
758
 
759
  with col2:
760
+ st.header("πŸ’¬ Analysis Chat")
761
+
762
+ if st.session_state.chatbot and st.session_state.facebook_data:
763
+ # Display chat history
764
+ for chat in st.session_state.chat_history:
765
+ if chat["role"] == "user":
766
+ with st.chat_message("user"):
767
+ st.write(chat['content'])
768
+ elif chat["role"] == "assistant":
769
+ with st.chat_message("assistant"):
770
+ st.write(chat['content'])
771
+
772
+ # Chat input
773
+ user_input = st.chat_input("Ask about the Facebook data...")
774
+
775
  if user_input:
776
+ st.session_state.chat_history.append({"role": "user", "content": user_input})
777
+
778
+ with st.spinner("πŸ€” Analyzing..."):
779
+ try:
780
+ if st.session_state.chatbot == "simple":
781
+ # Use simple analysis
782
+ response = simple_chat_analysis(user_input, st.session_state.facebook_data)
783
+ st.session_state.chat_history.append({"role": "assistant", "content": response})
784
+ else:
785
+ # Use AI chatbot
786
+ response = st.session_state.chatbot.invoke({"question": user_input})
787
+ answer = response.get("answer", "I couldn't generate a response.")
788
+ st.session_state.chat_history.append({"role": "assistant", "content": answer})
789
+ st.rerun()
790
+ except Exception as e:
791
+ error_msg = f"Analysis Error: {str(e)}"
792
+ st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
793
+ st.rerun()
794
+
795
+ # Suggested questions
796
+ if not st.session_state.chat_history:
797
+ st.subheader("πŸ’‘ Try asking:")
798
+ suggestions = [
799
+ "What is this Facebook group/page about?",
800
+ "Summarize the main content and purpose",
801
+ "What kind of community is this?",
802
+ "Analyze the engagement and activity level"
803
+ ]
804
+
805
+ for suggestion in suggestions:
806
+ if st.button(suggestion, key=f"fb_suggest_{suggestion}"):
807
+ st.info(f"Type: '{suggestion}' in chat")
808
+
809
+ elif st.session_state.facebook_data:
810
+ st.info("πŸ’¬ Start chatting about the Facebook data")
811
+ else:
812
+ st.info("πŸ” Extract Facebook data to enable analysis")
813
 
814
# Run the app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()