SurajJha21 committed on
Commit 4fc8401 · verified · 1 Parent(s): 6d29f58

Update app.py

Files changed (1):
  1. app.py (+50 -70)
app.py CHANGED
@@ -1,84 +1,64 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin, urlparse
+ import json
  import streamlit as st
- from langchain_groq import ChatGroq
- from langchain_community.document_loaders import WebBaseLoader
- from langchain.embeddings import HuggingFaceEmbeddings
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.chains.combine_documents import create_stuff_documents_chain
- from langchain_core.prompts import ChatPromptTemplate
- from langchain.chains import create_retrieval_chain
- from langchain_community.vectorstores import FAISS
- from transformers import AutoTokenizer, AutoModel
- import torch
- import numpy as np
- import time
-
- # Load tokenizer and model for embeddings
- try:
-     tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
-     model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
- except ImportError as e:
-     st.error(f"ImportError: {e}. Make sure you have the transformers library installed.")
- except Exception as e:
-     st.error(f"Failed to load tokenizer or model: {e}")
-
- class CustomHuggingFaceEmbeddings(HuggingFaceEmbeddings):
-     def __init__(self):
-         super().__init__(model_name="sentence-transformers/all-MiniLM-L6-v2")
-
-     def embed_documents(self, texts):
-         inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
-         with torch.no_grad():
-             embeddings = model(**inputs).last_hidden_state.mean(dim=1)
-         return embeddings.numpy()
-
- # Instantiate embeddings class
- embeddings = CustomHuggingFaceEmbeddings()
-
- if "vector" not in st.session_state:
-     st.session_state.loader = WebBaseLoader("https://docs.nvidia.com/cuda/")
-     st.session_state.docs = st.session_state.loader.load()
-
-     st.session_state.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-     documents = st.session_state.text_splitter.split_documents(st.session_state.docs[:50])
-
-     # Create FAISS index using the custom embeddings class
-     texts = [doc.page_content for doc in documents]
-     embedded_texts = embeddings.embed_documents(texts)
-     faiss_input = [(embedding, text) for embedding, text in zip(embedded_texts, texts)]
-     st.session_state.vectors = FAISS.from_embeddings(faiss_input)
-
- st.title("ChatGroq Demo")
- groq_api_key = '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'
- llm = ChatGroq(groq_api_key=groq_api_key, model_name="mixtral-8x7b-32768")
-
- prompt = ChatPromptTemplate.from_template(
-     """
-     Answer the questions based on the provided context only.
-     Please provide the most accurate response based on the question
-     <context>
-     {context}
-     <context>
-     Questions: {input}
-     """
- )
-
- document_chain = create_stuff_documents_chain(llm, prompt)
- retriever = st.session_state.vectors.as_retriever()
- retrieval_chain = create_retrieval_chain(retriever, document_chain)
-
- user_prompt = st.text_input("Input your prompt here")
-
- if user_prompt:
-     start = time.process_time()
-     try:
-         response = retrieval_chain.invoke({"input": user_prompt})
-         print("Response time:", time.process_time() - start)
-         st.write(response['answer'])
-
-         # With a Streamlit expander
-         with st.expander("Document Similarity Search"):
-             for doc in response.get("context", []):
-                 st.write(doc.page_content)
-                 st.write("--------------------------------")
-     except Exception as e:
-         st.error(f"Error during retrieval or response generation: {e}")
+
+ # Function to crawl a web page
+ def crawl(base_url, depth):
+     visited = set()
+     queue = [(base_url, 0)]
+     results = []
+     base_netloc = urlparse(base_url).netloc
+
+     while queue:
+         current_url, current_depth = queue.pop(0)
+         if current_depth > depth:
+             continue
+
+         if current_url in visited:
+             continue
+
+         visited.add(current_url)
+         try:
+             response = requests.get(current_url)
+             soup = BeautifulSoup(response.content, 'html.parser')
+             text = soup.get_text()
+
+             results.append({'url': current_url, 'content': text})
+
+             # Find all links on the page
+             for link in soup.find_all('a', href=True):
+                 href = link['href']
+                 full_url = urljoin(current_url, href)
+                 # Check if the link is within the base domain
+                 if urlparse(full_url).netloc == base_netloc:
+                     if full_url not in visited:
+                         queue.append((full_url, current_depth + 1))
+
+         except Exception as e:
+             print(f"Failed to fetch {current_url}: {e}")
+
+     return results
+
+ # Streamlit application
+ st.title("Custom Web Crawler Demo")
+
+ depth = st.slider("Depth", min_value=1, max_value=5, value=2)
+ base_url = st.text_input("Enter Base URL", "https://docs.nvidia.com/cuda/")
+
+ if st.button("Crawl"):
+     with st.spinner('Crawling...'):
+         data = crawl(base_url, depth)
+         st.write(f"Found {len(data)} pages")
+
+         # Optionally save the results to a JSON file
+         with open('crawled_data.json', 'w') as f:
+             json.dump(data, f, indent=2)
+
+         st.write(data)
+
+         # Display the first page's content for demo purposes
+         if data:
+             st.write("First page content:")
+             st.write(data[0]['content'])
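
Note on the new crawler: crawl() issues requests.get() with no timeout and no cap on the number of pages, so an unresponsive server or a large site such as https://docs.nvidia.com/cuda/ can stall the Streamlit session. Below is a minimal standalone sketch of the same breadth-first, same-domain walk with a timeout and a page cap added; crawl_limited, max_pages, and the 10-second timeout are illustrative assumptions, not part of the committed code.

    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin, urlparse

    # Hypothetical variant of the committed crawl(): same breadth-first walk
    # over same-domain links, with a request timeout and a page cap added.
    def crawl_limited(base_url, depth, max_pages=25, timeout=10):
        visited = set()
        queue = [(base_url, 0)]
        results = []
        base_netloc = urlparse(base_url).netloc

        while queue and len(results) < max_pages:
            current_url, current_depth = queue.pop(0)
            if current_depth > depth or current_url in visited:
                continue
            visited.add(current_url)
            try:
                response = requests.get(current_url, timeout=timeout)
                soup = BeautifulSoup(response.content, "html.parser")
                results.append({"url": current_url, "content": soup.get_text()})
                # Queue links that stay on the base domain
                for link in soup.find_all("a", href=True):
                    full_url = urljoin(current_url, link["href"])
                    if urlparse(full_url).netloc == base_netloc and full_url not in visited:
                        queue.append((full_url, current_depth + 1))
            except Exception as e:
                print(f"Failed to fetch {current_url}: {e}")
        return results

    if __name__ == "__main__":
        pages = crawl_limited("https://docs.nvidia.com/cuda/", depth=1)
        print(f"Fetched {len(pages)} pages")

Swapping the list-based queue for collections.deque would also make removing items from the front O(1), though at the small depths exposed by the slider the difference is negligible.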