ilsa15 committed on
Commit
e391969
·
verified ·
1 Parent(s): b5f01aa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -218
app.py CHANGED
@@ -493,232 +493,160 @@
493
  # main()
494
 
495
 
 
496
 
497
- # import nest_asyncio
498
- # import streamlit as st
499
- # import os
500
- # import requests
501
- # from youtube_transcript_api import YouTubeTranscriptApi
502
- # from groq import Groq
503
- # from bs4 import BeautifulSoup
504
- # from sentence_transformers import SentenceTransformer
505
- # import chromadb
506
- # from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
507
-
508
- # nest_asyncio.apply()
509
-
510
- # # --- CONFIGURATION ---
511
- # YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY")
512
- # GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
513
- # channel_id="UCsv3kmQ5k1eIRG2R9mWN-QA" #channelId
514
-
515
- # BASE_URL = "https://icode.guru"
516
-
517
- # groq_client = Groq(api_key=GROQ_API_KEY)
518
- # embedding_function = SentenceTransformerEmbeddingFunction("all-MiniLM-L6-v2")
519
-
520
- # chroma_client = chromadb.Client()
521
- # collection = chroma_client.get_or_create_collection("icodeguru_knowledge", embedding_function=embedding_function)
522
-
523
- # # --- Search persistent vector DB ---
524
- # def search_vector_data(query):
525
- # results = collection.query(query_texts=[query], n_results=3)
526
- # if results and results["documents"]:
527
- # return "\n\n".join([doc for doc in results["documents"][0]])
528
- # return None
529
-
530
- # # --- Fetch recent video IDs from YouTube channel ---
531
- # def get_latest_video_ids(channel_id, max_results=5):
532
- # url = f"https://www.googleapis.com/youtube/v3/search?key={YOUTUBE_API_KEY}&channelId={channel_id}&part=snippet,id&order=date&maxResults={max_results}"
533
- # response = requests.get(url)
534
- # videos = response.json().get('items', [])
535
-
536
- # valid_videos = []
537
- # for v in videos:
538
- # if v['id']['kind'] == 'youtube#video':
539
- # title = v['snippet']['title']
540
- # channel_title = v['snippet']['channelTitle']
541
- # video_id = v['id']['videoId']
542
- # if "icodeguru" in channel_title.lower():
543
- # valid_videos.append((video_id, title))
544
- # return valid_videos
545
-
546
- # # --- Get video transcripts ---
547
- # def get_video_transcripts(video_info):
548
- # results = []
549
- # for vid, title in video_info:
550
- # try:
551
- # transcript = YouTubeTranscriptApi.get_transcript(vid)
552
- # text = " ".join([t['text'] for t in transcript])
553
- # video_link = f"https://www.youtube.com/watch?v={vid}"
554
- # results.append({
555
- # "video_id": vid,
556
- # "title": title,
557
- # "link": video_link,
558
- # "transcript": text
559
- # })
560
- # except:
561
- # continue
562
- # return results
563
-
564
- # # --- Scrape icode.guru ---
565
- # def scrape_icodeguru(base_url=BASE_URL, max_pages=5):
566
- # visited = set()
567
- # blocks = []
568
-
569
- # def crawl(url):
570
- # if url in visited or len(visited) >= max_pages:
571
- # return
572
- # visited.add(url)
573
- # try:
574
- # res = requests.get(url, timeout=10)
575
- # soup = BeautifulSoup(res.content, "html.parser")
576
- # page_text = soup.get_text(separator=" ", strip=True)
577
- # if len(page_text) > 100:
578
- # blocks.append(f"[{url}]({url}):\n{page_text[:1500]}")
579
- # for link in soup.find_all("a", href=True):
580
- # href = link['href']
581
- # if href.startswith("/"):
582
- # href = base_url + href
583
- # if href.startswith(base_url):
584
- # crawl(href)
585
- # except:
586
- # pass
587
-
588
- # crawl(base_url)
589
- # return blocks
590
-
591
- # # --- Ask Groq ---
592
- # def ask_groq(context, question):
593
- # messages = [
594
- # {"role": "system", "content": "You are a helpful assistant. Always provide relevant video and website links if possible."},
595
- # {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}\nAnswer (include links):"}
596
- # ]
597
- # chat_completion = groq_client.chat.completions.create(
598
- # model="llama3-8b-8192",
599
- # messages=messages,
600
- # )
601
- # return chat_completion.choices[0].message.content.strip()
602
-
603
- # #--- STREAMLIT APP ---
604
- # def main():
605
- # st.set_page_config(page_title="EduBot for iCodeGuru", layout="wide")
606
- # st.title("πŸŽ“ EduBot for @icodeguru0")
607
- # st.markdown("Ask anything based on the latest YouTube videos and website content of [icode.guru](https://icode.guru).")
608
-
609
- # user_question = st.text_input("πŸ’¬ Ask your question:")
610
-
611
- # if user_question:
612
- # # Try vector DB first
613
- # vector_context = search_vector_data(user_question)
614
- # if vector_context:
615
- # with st.spinner("🧠 Answering from knowledge base..."):
616
- # answer = ask_groq(vector_context, user_question)
617
- # st.success(answer)
618
- # else:
619
- # # Fallback to real-time data
620
- # with st.spinner("πŸ“Ί Fetching YouTube videos..."):
621
- # video_info = get_latest_video_ids(channel_id, max_results=5)
622
- # transcripts = get_video_transcripts(video_info)
623
-
624
- # yt_context = ""
625
- # relevant_links = []
626
- # for vid in transcripts:
627
- # yt_context += f"\n\n[Video: {vid['title']}]({vid['link']}):\n{vid['transcript'][:1500]}"
628
- # if user_question.lower() in vid['transcript'].lower():
629
- # relevant_links.append(vid['link'])
630
-
631
- # with st.spinner("🌐 Scraping icode.guru..."):
632
- # site_blocks = scrape_icodeguru(BASE_URL, max_pages=5)
633
- # site_context = "\n\n".join(site_blocks)
634
-
635
- # full_context = yt_context + "\n\n" + site_context
636
-
637
- # with st.spinner("🧠 Thinking..."):
638
- # answer = ask_groq(full_context, user_question)
639
- # st.success(answer)
640
-
641
- # if relevant_links:
642
- # st.markdown("### πŸ”— Related YouTube Links")
643
- # for link in relevant_links:
644
- # st.markdown(f"- [Watch Video]({link})")
645
-
646
- # st.markdown("---")
647
- # st.caption("Powered by YouTube, iCodeGuru, and Groq")
648
-
649
- # if __name__ == "__main__":
650
- # main()
651
 
 
652
  import streamlit as st
653
  import os
654
- import json
655
- from langchain.text_splitter import RecursiveCharacterTextSplitter
656
- from langchain.embeddings import HuggingFaceEmbeddings
657
- import chromadb
658
- from chromadb.config import Settings
659
- from langchain.vectorstores import Chroma
660
  from groq import Groq
 
 
 
 
661
 
662
- # ---- Config ----
663
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
664
- MODEL_NAME = "mixtral-8x7b-32768"
665
- DATA_PATH = "data" # local folder with all files from GitHub repo
666
-
667
- # ---- Setup ----
668
- groq_client = Groq(api_key=GROQ_API_KEY)
669
- embed_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
670
- chroma_client = chromadb.Client(Settings(persist_directory="chromadb_store", anonymized_telemetry=False))
671
-
672
- # ---- Load and Embed ----
673
- @st.cache_resource
674
- def load_vector_db():
675
- docs = []
676
-
677
- for fname in os.listdir(DATA_PATH):
678
- fpath = os.path.join(DATA_PATH, fname)
679
- if fname.endswith(".txt"):
680
- with open(fpath, 'r', encoding='utf-8') as f:
681
- text = f.read()
682
- elif fname.endswith(".json"):
683
- with open(fpath, 'r', encoding='utf-8') as f:
684
- content = json.load(f)
685
- text = json.dumps(content)
686
- else:
687
- continue
688
-
689
- splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
690
- docs.extend(splitter.create_documents([text]))
691
 
692
- # Save to ChromaDB
693
- vectordb = Chroma.from_documents(
694
- documents=docs,
695
- embedding=embed_model,
696
- persist_directory="chromadb_store"
697
- )
698
- vectordb.persist()
699
- return vectordb
700
 
701
- db = load_vector_db()
702
 
703
- # ---- RAG QA ----
704
- def answer_with_rag(query):
705
- docs = db.similarity_search(query, k=3)
706
- if not docs:
707
- return "⚠️ No relevant answer found in embedded knowledge."
708
- context = "\n".join([doc.page_content for doc in docs])
709
-
710
- prompt = f"Answer the following using only the provided context:\n\nContext:\n{context}\n\nQuestion: {query}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
711
  chat_completion = groq_client.chat.completions.create(
712
- messages=[{"role": "user", "content": prompt}],
713
- model=MODEL_NAME,
714
  )
715
- return chat_completion.choices[0].message.content
716
-
717
- # ---- Streamlit UI ----
718
- st.title("πŸ“š iCodeGuru ChatBot (RAG + Chroma + Groq)")
719
-
720
- user_query = st.text_input("Ask me something about iCodeGuru:")
721
- if user_query:
722
- with st.spinner("Thinking..."):
723
- response = answer_with_rag(user_query)
724
- st.success(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  # main()
494
 
495
 
496
+ #(only stored data)
497
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
 
499
+ import nest_asyncio
500
  import streamlit as st
501
  import os
502
+ import requests
503
+ from youtube_transcript_api import YouTubeTranscriptApi
 
 
 
 
504
  from groq import Groq
505
+ from bs4 import BeautifulSoup
506
+ from sentence_transformers import SentenceTransformer
507
+ import chromadb
508
+ from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
509
 
510
+ nest_asyncio.apply()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
 
512
+ # --- CONFIGURATION ---
513
+ YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY")
514
+ GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
515
+ channel_id="UCsv3kmQ5k1eIRG2R9mWN-QA" #channelId
 
 
 
 
516
 
517
+ BASE_URL = "https://icode.guru"
518
 
519
+ groq_client = Groq(api_key=GROQ_API_KEY)
520
+ embedding_function = SentenceTransformerEmbeddingFunction("all-MiniLM-L6-v2")
521
+
522
+ chroma_client = chromadb.Client()
523
+ collection = chroma_client.get_or_create_collection("icodeguru_knowledge", embedding_function=embedding_function)
524
+
525
+ # --- Search persistent vector DB ---
526
+ def search_vector_data(query):
527
+ results = collection.query(query_texts=[query], n_results=3)
528
+ if results and results["documents"]:
529
+ return "\n\n".join([doc for doc in results["documents"][0]])
530
+ return None
531
+
532
+ # --- Fetch recent video IDs from YouTube channel ---
533
+ def get_latest_video_ids(channel_id, max_results=5):
534
+ url = f"https://www.googleapis.com/youtube/v3/search?key={YOUTUBE_API_KEY}&channelId={channel_id}&part=snippet,id&order=date&maxResults={max_results}"
535
+ response = requests.get(url)
536
+ videos = response.json().get('items', [])
537
+
538
+ valid_videos = []
539
+ for v in videos:
540
+ if v['id']['kind'] == 'youtube#video':
541
+ title = v['snippet']['title']
542
+ channel_title = v['snippet']['channelTitle']
543
+ video_id = v['id']['videoId']
544
+ if "icodeguru" in channel_title.lower():
545
+ valid_videos.append((video_id, title))
546
+ return valid_videos
547
+
548
+ # --- Get video transcripts ---
549
+ def get_video_transcripts(video_info):
550
+ results = []
551
+ for vid, title in video_info:
552
+ try:
553
+ transcript = YouTubeTranscriptApi.get_transcript(vid)
554
+ text = " ".join([t['text'] for t in transcript])
555
+ video_link = f"https://www.youtube.com/watch?v={vid}"
556
+ results.append({
557
+ "video_id": vid,
558
+ "title": title,
559
+ "link": video_link,
560
+ "transcript": text
561
+ })
562
+ except:
563
+ continue
564
+ return results
565
+
566
+ # --- Scrape icode.guru ---
567
+ def scrape_icodeguru(base_url=BASE_URL, max_pages=5):
568
+ visited = set()
569
+ blocks = []
570
+
571
+ def crawl(url):
572
+ if url in visited or len(visited) >= max_pages:
573
+ return
574
+ visited.add(url)
575
+ try:
576
+ res = requests.get(url, timeout=10)
577
+ soup = BeautifulSoup(res.content, "html.parser")
578
+ page_text = soup.get_text(separator=" ", strip=True)
579
+ if len(page_text) > 100:
580
+ blocks.append(f"[{url}]({url}):\n{page_text[:1500]}")
581
+ for link in soup.find_all("a", href=True):
582
+ href = link['href']
583
+ if href.startswith("/"):
584
+ href = base_url + href
585
+ if href.startswith(base_url):
586
+ crawl(href)
587
+ except:
588
+ pass
589
+
590
+ crawl(base_url)
591
+ return blocks
592
+
593
+ # --- Ask Groq ---
594
+ def ask_groq(context, question):
595
+ messages = [
596
+ {"role": "system", "content": "You are a helpful assistant. Always provide relevant video and website links if possible."},
597
+ {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}\nAnswer (include links):"}
598
+ ]
599
  chat_completion = groq_client.chat.completions.create(
600
+ model="llama3-8b-8192",
601
+ messages=messages,
602
  )
603
+ return chat_completion.choices[0].message.content.strip()
604
+
605
+ #--- STREAMLIT APP ---
606
+ def main():
607
+ st.set_page_config(page_title="EduBot for iCodeGuru", layout="wide")
608
+ st.title("πŸŽ“ EduBot for @icodeguru0")
609
+ st.markdown("Ask anything based on the latest YouTube videos and website content of [icode.guru](https://icode.guru).")
610
+
611
+ user_question = st.text_input("πŸ’¬ Ask your question:")
612
+
613
+ if user_question:
614
+ # Try vector DB first
615
+ vector_context = search_vector_data(user_question)
616
+ if vector_context:
617
+ with st.spinner("🧠 Answering from knowledge base..."):
618
+ answer = ask_groq(vector_context, user_question)
619
+ st.success(answer)
620
+ else:
621
+ # Fallback to real-time data
622
+ with st.spinner("πŸ“Ί Fetching YouTube videos..."):
623
+ video_info = get_latest_video_ids(channel_id, max_results=5)
624
+ transcripts = get_video_transcripts(video_info)
625
+
626
+ yt_context = ""
627
+ relevant_links = []
628
+ for vid in transcripts:
629
+ yt_context += f"\n\n[Video: {vid['title']}]({vid['link']}):\n{vid['transcript'][:1500]}"
630
+ if user_question.lower() in vid['transcript'].lower():
631
+ relevant_links.append(vid['link'])
632
+
633
+ with st.spinner("🌐 Scraping icode.guru..."):
634
+ site_blocks = scrape_icodeguru(BASE_URL, max_pages=5)
635
+ site_context = "\n\n".join(site_blocks)
636
+
637
+ full_context = yt_context + "\n\n" + site_context
638
+
639
+ with st.spinner("🧠 Thinking..."):
640
+ answer = ask_groq(full_context, user_question)
641
+ st.success(answer)
642
+
643
+ if relevant_links:
644
+ st.markdown("### πŸ”— Related YouTube Links")
645
+ for link in relevant_links:
646
+ st.markdown(f"- [Watch Video]({link})")
647
+
648
+ st.markdown("---")
649
+ st.caption("Powered by YouTube, iCodeGuru, and Groq")
650
+
651
+ if __name__ == "__main__":
652
+ main()