selfDotOsman commited on
Commit
54e8517
·
1 Parent(s): 6c5096e
Files changed (7) hide show
  1. RAG.py +21 -0
  2. README.md +2 -2
  3. agent.py +32 -0
  4. app.py +19 -0
  5. chains_and_vars.py +55 -0
  6. requirements.txt +9 -0
  7. scraper.py +32 -0
RAG.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.schema import Document
2
+ from langchain.vectorstores import Chroma
3
+ from langchain.document_loaders import TextLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.vectorstores import FAISS
6
+ from chains_and_vars import final_chain, embedder
7
+
8
def RAG(page_info, instruction):
    """Answer `instruction` with retrieval-augmented generation over `page_info`.

    The raw page text is wrapped in a Document, split into overlapping
    chunks, indexed in an in-memory FAISS store, and the single most
    relevant chunk is passed to `final_chain` together with the instruction.
    """
    # Loading phase: the scraped page becomes one Document.
    documents = [Document(page_content=page_info)]

    # Indexing/splitting: overlap keeps context across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=200,
    )
    chunks = text_splitter.split_documents(documents)

    # Embedding + vector store: retrieval engine built fresh per call.
    store = FAISS.from_documents(chunks, embedder)

    # Fetch the one most relevant chunk via maximal marginal relevance.
    context_docs = store.max_marginal_relevance_search(instruction, 1)
    answer = final_chain.run(context=context_docs, question=instruction)

    # Drop the index explicitly; it is rebuilt on every invocation.
    del store
    return answer
README.md CHANGED
@@ -9,5 +9,5 @@ app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
9
  pinned: false
10
  license: mit
11
  ---
12
+ # Web-sight
13
+ Web-sight uses RAG (Retrieval Augmented Generation) to retrieve and analyze content from any website.
agent.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re, requests,ast
2
+ from chains_and_vars import *
3
+ from scraper import extract_info
4
+ from RAG import RAG
5
+
6
+
7
def Model(user_query):
    """Route a user query: answer directly, or scrape a linked site and RAG over it.

    Flow:
      1. Ask the LLM whether the query contains a link.
      2. If so, ask whether answering requires the site's content.
      3. If both yes: extract the link list, scrape the first link, distill
         the user's instruction, and answer via RAG over the scraped content.
      4. Otherwise fall back to the plain conversational chain.

    Returns the model's answer as a string; on any failure a generic
    apology string is returned instead of raising.
    """
    is_link = PromptTemplate(
        input_variables=['query'],
        template=prompt.format(user_query="{query}")
    )
    try:
        link_chain = LLMChain(llm=model, prompt=is_link)
        answer = link_chain.run(user_query)
        if 'yes' in answer.lower():
            if_open = chain3.run(user_query)
            print("if_open: ", if_open)
            if 'yes' in if_open.lower():
                find_links = chain1.run(user_query)
                print(find_links)
                # The LLM is instructed to answer like "['link','link']";
                # grab the first bracketed list and parse it safely.
                links = ast.literal_eval(re.findall(r'\[.*?\]', find_links)[0])
                content = extract_info(links[0])
                instruction = chain2.run(user_query)
                return RAG(content, instruction)
        # No link in the query, or the site's content is not needed:
        # answer with the plain conversational persona chain.
        return system_prompt_chain.run(user_query, callbacks=callbacks)
    except Exception:
        # Top-level boundary: narrow the original bare `except:` and
        # return a readable message (original string was garbled).
        return "There was an error responding to your query. Please try again later!"
app.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from agent import Model
3
+
4
def process_query(query):
    """Forward the user's query to the agent and return its response."""
    return Model(query)


# Create the Gradio interface
iface = gr.Interface(
    fn=process_query,
    inputs=gr.Textbox(lines=2, placeholder="Speak to Web-sight..."),
    outputs="text",
    title="Welcome to Web-sight 👀👋",
    # Description typos fixed: "is uses" -> "uses", "gte" -> "get",
    # "from website" -> "from websites".
    description=(
        "Web-sight uses **RAG (Retrieval Augmented Generation)** to retrieve "
        "and analyze content from websites. \n\n\n Simply provide Web-sight "
        "with a clear instruction and include the URL of the website you'd "
        "like to get information about."
    ),
    theme="default",
    allow_flagging="never"
)

# Launch the app
iface.launch(share=True)
chains_and_vars.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.llms import HuggingFaceEndpoint
2
+ from langchain.chains import LLMChain
3
+ from langchain.prompts import PromptTemplate
4
+ import os
5
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
6
+ from langchain.embeddings import HuggingFaceEmbeddings
7
+
8
+
9
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the env-provided token.
token = os.getenv("HUGGING_FACE_HUB_TOKEN")
login(token)

# Stream generated tokens to stdout as they arrive.
callbacks = [StreamingStdOutCallbackHandler()]
# Sentence embedder used to vectorize document chunks for retrieval.
embedder = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

# Hosted Mistral-7B-Instruct endpoint shared by every chain below.
model = HuggingFaceEndpoint(
    endpoint_url="https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2",
    huggingfacehub_api_token=token,
    verbose=True,
    temperature=0.25,
    top_p=0.9,
    repetition_penalty=1.2
)

# Conversational persona used when no link/scraping path applies.
system_prompt = PromptTemplate(
    template="System: Your name is Web-sight. You were designed to extract content from websites. If a user's instruction does contain an instruction that requires you to visit a website, answer them but remind them that you are a website analyser and you would like them to add a link or url to their query so you can look at the website. \n\n Human: {user_query} \n\n AI: ",
    input_variables=['user_query']
)

# Yes/no classifier: does the query contain a link or URL?
prompt = PromptTemplate(
    template="System: You are an expert link checker. Your task is to determine if there is any link, URL, or webpage (such as those containing 'http', 'https', 'www', or '.com', '.org', etc.) in the user's query. If you find any link or URL, answer 'yes'. If you find none, answer 'no'. Do not provide any explanation, just answer 'yes' or 'no'. \n\n Human: {user_query} \n\n Is there a link to a website in the above text? Answer strictly 'yes' or 'no': \n\n AI:",
    input_variables=['user_query']
)
# Link extractor: returns links as a Python-style list literal.
prompt_1 = PromptTemplate(
    template="You are a link extractor AI. Your job is to only look for links in user messages. Given a message from a user, extract all the links in the query in the format: ['link','link'] ALWAYS respond with only a link. \n\n Human: {user_query} \n Please, extract all the links in the above message. Respond with the links in the right format \n\n AI:",
    input_variables=['user_query']
)
# Instruction distiller: summarizes what the user is asking for.
prompt_2 = PromptTemplate(
    template="You are an instruction extractor AI. Given an instruction from a user, Summarize what the user is asking for. Don't give any further explanation \n\n Human: {user_query} \n\n AI:",
    input_variables=['user_query']
)
# Yes/no validator: does answering require the website's content?
# (Original wording was garbled: "requires as to know ... a websit".)
prompt_3 = PromptTemplate(
    template="You are an instruction validator AI. Your job is to check whether the user's query requires us to know the content from a website. Answer strictly yes or no in one word \n\n Human: {user_query} \n Does the above message require us to know the content of the site? Answer strictly yes or no in one word \n\n AI:",
    input_variables=['user_query']
)

# Final answer synthesis over the retrieved context.
final_prompt = PromptTemplate(
    template="System: This is context collected from a website. Summarize the given context as an answer to the question provided. Be as detailed as you wish. When referring to the context, always use 'website' or 'webpage'. For example, according to the website, \n\n Context: {context} \n\n Question: {question} \n\n Answer:",
    input_variables=['context', 'question'])

system_prompt_chain = LLMChain(llm=model, prompt=system_prompt)
chain = LLMChain(llm=model, prompt=prompt)
chain1 = LLMChain(llm=model, prompt=prompt_1)
chain2 = LLMChain(llm=model, prompt=prompt_2)
chain3 = LLMChain(llm=model, prompt=prompt_3)
final_chain = LLMChain(llm=model, prompt=final_prompt)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ beautifulsoup4==4.12.3
2
+ gradio==4.42.0
3
+ huggingface_hub==0.21.4
4
+ langchain==0.2.14
5
+ langchain-community
6
+ transformers
7
+ faiss-cpu
8
+ sentence-transformers
9
+ Requests==2.32.3
scraper.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import re
4
+
5
def extract_info(url):
    """Fetch `url` and return its main textual content as one string.

    Strips common boilerplate elements (nav, header, footer, aside,
    script, style), locates the main content area, and concatenates the
    text of paragraphs and headings.

    Returns None when the page cannot be fetched, and '' when the
    document has no parseable body.
    """
    try:
        # Timeout prevents hanging indefinitely on unresponsive hosts
        # (original call had no timeout at all).
        response = requests.get(url, timeout=15)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching the website: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove common noise elements
    for element in soup(['nav', 'header', 'footer', 'aside', 'script', 'style']):
        element.decompose()

    # Find the main content area (adjust as needed for specific websites)
    main_content = (soup.find('main')
                    or soup.find('article')
                    or soup.find('div', class_=re.compile(r'content|main|body'))
                    or soup.body)

    # Guard: non-HTML responses can yield no <body>, which previously
    # crashed with AttributeError on .find_all below.
    if main_content is None:
        return ''

    important_text = []
    for elem in main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        text = elem.get_text(strip=True)
        if text:  # Ignore empty paragraphs/headings
            important_text.append(text)

    return ' '.join(important_text)