selfDotOsman commited on
Commit
54e8517
·
1 Parent(s): 6c5096e
Files changed (7) hide show
  1. RAG.py +21 -0
  2. README.md +2 -2
  3. agent.py +32 -0
  4. app.py +19 -0
  5. chains_and_vars.py +55 -0
  6. requirements.txt +9 -0
  7. scraper.py +32 -0
RAG.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.schema import Document
2
+ from langchain.vectorstores import Chroma
3
+ from langchain.document_loaders import TextLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.vectorstores import FAISS
6
+ from chains_and_vars import final_chain, embedder
7
+
8
def RAG(page_info, instruction):
    """Answer `instruction` with retrieval-augmented generation over `page_info`.

    The raw page text is wrapped in a Document, split into overlapping
    chunks, indexed in an in-memory FAISS store, and the single most
    relevant chunk is passed to `final_chain` together with the instruction.
    """
    # Loading phase: the scraped page becomes one Document.
    documents = [Document(page_content=page_info)]

    # Indexing/splitting: overlap keeps context across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=200,
    )
    chunks = text_splitter.split_documents(documents)

    # Embedding + vector store: retrieval engine built fresh per call.
    store = FAISS.from_documents(chunks, embedder)

    # Fetch the one most relevant chunk via maximal marginal relevance.
    context_docs = store.max_marginal_relevance_search(instruction, 1)
    answer = final_chain.run(context=context_docs, question=instruction)

    # Drop the index explicitly; it is rebuilt on every invocation.
    del store
    return answer
README.md CHANGED
@@ -9,5 +9,5 @@ app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
9
  pinned: false
10
  license: mit
11
  ---
12
+ # Web-sight
13
+ Web-sight uses RAG (Retrieval Augmented Generation) to retrieve and analyze content from any website.
agent.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re, requests,ast
2
+ from chains_and_vars import *
3
+ from scraper import extract_info
4
+ from RAG import RAG
5
+
6
+
7
def Model(user_query):
    """Route a user query: answer directly, or scrape a linked site and RAG over it.

    Flow:
      1. Ask the LLM whether the query contains a link.
      2. If so, ask whether answering requires the site's content.
      3. If both yes: extract the link list, scrape the first link, distill
         the user's instruction, and answer via RAG over the scraped content.
      4. Otherwise fall back to the plain conversational chain.

    Returns the model's answer as a string; on any failure a generic
    apology string is returned instead of raising.
    """
    is_link = PromptTemplate(
        input_variables=['query'],
        template=prompt.format(user_query="{query}")
    )
    try:
        link_chain = LLMChain(llm=model, prompt=is_link)
        answer = link_chain.run(user_query)
        if 'yes' in answer.lower():
            if_open = chain3.run(user_query)
            print("if_open: ", if_open)
            if 'yes' in if_open.lower():
                find_links = chain1.run(user_query)
                print(find_links)
                # The LLM is instructed to answer like "['link','link']";
                # grab the first bracketed list and parse it safely.
                links = ast.literal_eval(re.findall(r'\[.*?\]', find_links)[0])
                content = extract_info(links[0])
                instruction = chain2.run(user_query)
                return RAG(content, instruction)
        # No link in the query, or the site's content is not needed:
        # answer with the plain conversational persona chain.
        return system_prompt_chain.run(user_query, callbacks=callbacks)
    except Exception:
        # Top-level boundary: narrow the original bare `except:` and
        # return a readable message (original string was garbled).
        return "There was an error responding to your query. Please try again later!"
app.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from agent import Model
3
+
4
def process_query(query):
    """Forward the user's query to the agent and return its response."""
    return Model(query)


# Create the Gradio interface
iface = gr.Interface(
    fn=process_query,
    inputs=gr.Textbox(lines=2, placeholder="Speak to Web-sight..."),
    outputs="text",
    title="Welcome to Web-sight 👀👋",
    # Description typos fixed: "is uses" -> "uses", "gte" -> "get",
    # "from website" -> "from websites".
    description=(
        "Web-sight uses **RAG (Retrieval Augmented Generation)** to retrieve "
        "and analyze content from websites. \n\n\n Simply provide Web-sight "
        "with a clear instruction and include the URL of the website you'd "
        "like to get information about."
    ),
    theme="default",
    allow_flagging="never"
)

# Launch the app
iface.launch(share=True)
chains_and_vars.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.llms import HuggingFaceEndpoint
2
+ from langchain.chains import LLMChain
3
+ from langchain.prompts import PromptTemplate
4
+ import os
5
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
6
+ from langchain.embeddings import HuggingFaceEmbeddings
7
+
8
+
9
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the env-provided token.
token = os.getenv("HUGGING_FACE_HUB_TOKEN")
login(token)

# Stream generated tokens to stdout as they arrive.
callbacks = [StreamingStdOutCallbackHandler()]
# Sentence embedder used to vectorize document chunks for retrieval.
embedder = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

# Hosted Mistral-7B-Instruct endpoint shared by every chain below.
model = HuggingFaceEndpoint(
    endpoint_url="https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2",
    huggingfacehub_api_token=token,
    verbose=True,
    temperature=0.25,
    top_p=0.9,
    repetition_penalty=1.2
)

# Conversational persona used when no link/scraping path applies.
system_prompt = PromptTemplate(
    template="System: Your name is Web-sight. You were designed to extract content from websites. If a user's instruction does contain an instruction that requires you to visit a website, answer them but remind them that you are a website analyser and you would like them to add a link or url to their query so you can look at the website. \n\n Human: {user_query} \n\n AI: ",
    input_variables=['user_query']
)

# Yes/no classifier: does the query contain a link or URL?
prompt = PromptTemplate(
    template="System: You are an expert link checker. Your task is to determine if there is any link, URL, or webpage (such as those containing 'http', 'https', 'www', or '.com', '.org', etc.) in the user's query. If you find any link or URL, answer 'yes'. If you find none, answer 'no'. Do not provide any explanation, just answer 'yes' or 'no'. \n\n Human: {user_query} \n\n Is there a link to a website in the above text? Answer strictly 'yes' or 'no': \n\n AI:",
    input_variables=['user_query']
)
# Link extractor: returns links as a Python-style list literal.
prompt_1 = PromptTemplate(
    template="You are a link extractor AI. Your job is to only look for links in user messages. Given a message from a user, extract all the links in the query in the format: ['link','link'] ALWAYS respond with only a link. \n\n Human: {user_query} \n Please, extract all the links in the above message. Respond with the links in the right format \n\n AI:",
    input_variables=['user_query']
)
# Instruction distiller: summarizes what the user is asking for.
prompt_2 = PromptTemplate(
    template="You are an instruction extractor AI. Given an instruction from a user, Summarize what the user is asking for. Don't give any further explanation \n\n Human: {user_query} \n\n AI:",
    input_variables=['user_query']
)
# Yes/no validator: does answering require the website's content?
# (Original wording was garbled: "requires as to know ... a websit".)
prompt_3 = PromptTemplate(
    template="You are an instruction validator AI. Your job is to check whether the user's query requires us to know the content from a website. Answer strictly yes or no in one word \n\n Human: {user_query} \n Does the above message require us to know the content of the site? Answer strictly yes or no in one word \n\n AI:",
    input_variables=['user_query']
)

# Final answer synthesis over the retrieved context.
final_prompt = PromptTemplate(
    template="System: This is context collected from a website. Summarize the given context as an answer to the question provided. Be as detailed as you wish. When referring to the context, always use 'website' or 'webpage'. For example, according to the website, \n\n Context: {context} \n\n Question: {question} \n\n Answer:",
    input_variables=['context', 'question'])

system_prompt_chain = LLMChain(llm=model, prompt=system_prompt)
chain = LLMChain(llm=model, prompt=prompt)
chain1 = LLMChain(llm=model, prompt=prompt_1)
chain2 = LLMChain(llm=model, prompt=prompt_2)
chain3 = LLMChain(llm=model, prompt=prompt_3)
final_chain = LLMChain(llm=model, prompt=final_prompt)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ beautifulsoup4==4.12.3
2
+ gradio==4.42.0
3
+ huggingface_hub==0.21.4
4
+ langchain==0.2.14
5
+ langchain-community
6
+ transformers
7
+ faiss-cpu
8
+ sentence-transformers
9
+ Requests==2.32.3
scraper.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import re
4
+
5
def extract_info(url):
    """Fetch `url` and return its main textual content as one string.

    Strips common boilerplate elements (nav, header, footer, aside,
    script, style), locates the main content area, and concatenates the
    text of paragraphs and headings.

    Returns None when the page cannot be fetched, and '' when the
    document has no parseable body.
    """
    try:
        # Timeout prevents hanging indefinitely on unresponsive hosts
        # (original call had no timeout at all).
        response = requests.get(url, timeout=15)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching the website: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove common noise elements
    for element in soup(['nav', 'header', 'footer', 'aside', 'script', 'style']):
        element.decompose()

    # Find the main content area (adjust as needed for specific websites)
    main_content = (soup.find('main')
                    or soup.find('article')
                    or soup.find('div', class_=re.compile(r'content|main|body'))
                    or soup.body)

    # Guard: non-HTML responses can yield no <body>, which previously
    # crashed with AttributeError on .find_all below.
    if main_content is None:
        return ''

    important_text = []
    for elem in main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        text = elem.get_text(strip=True)
        if text:  # Ignore empty paragraphs/headings
            important_text.append(text)

    return ' '.join(important_text)