Chat_with_URL1 / webchat.py
RAHMAN00700's picture
first commit
dee16ac
import os
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from chromadb.api.types import EmbeddingFunction
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods
import requests
from bs4 import BeautifulSoup
import spacy
import chromadb
from utils import chromadb_client
# Pull settings from the local .env file into the process environment.
load_dotenv()

# --- watsonx.ai connection settings ---
WATSONX_URL = "https://us-south.ml.cloud.ibm.com"
API_KEY = os.environ.get("api_key")
PROJECT_ID = os.environ.get("project_id")

# Fail fast at import time when either credential is absent or empty.
if not (API_KEY and PROJECT_ID):
    raise ValueError("API Key or Project ID is missing from the .env file.")

# --- Text-generation settings (read by get_model) ---
# NOTE(review): temperature/top_k/top_p are supplied together with GREEDY
# decoding -- confirm the service actually honours them in that mode.
MODEL_PARAMS = dict(
    model_type="meta-llama/llama-3-70b-instruct",
    max_tokens=100,
    min_tokens=50,
    decoding=DecodingMethods.GREEDY,
    temperature=0.7,
    top_k=50,
    top_p=1,
)

# --- Local model cache: a ".cache" folder under the current working directory ---
CACHE_DIR = os.path.join(os.getcwd(), ".cache")
os.makedirs(CACHE_DIR, exist_ok=True)
# NOTE(review): HF_HOME is assigned after the Hugging Face libraries were
# imported -- confirm it still takes effect. The explicit cache_folder argument
# below does not rely on it.
os.environ["HF_HOME"] = CACHE_DIR

# --- Sentence-embedding model, loaded once at import time ---
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(MODEL_NAME, cache_folder=CACHE_DIR)
# Embedding function backed by the shared sentence-transformers model
class MiniLML6V2EmbeddingFunction(EmbeddingFunction):
    """Embedding callable that encodes texts with the module-level model."""

    # A single model object shared by every instance of this class.
    MODEL = embedding_model

    def __call__(self, texts):
        """Encode *texts* and return the result converted with ``.tolist()``."""
        vectors = MiniLML6V2EmbeddingFunction.MODEL.encode(texts)
        return vectors.tolist()
def get_model(params):
    """Build an IBM watsonx foundation-model client from a settings mapping.

    Args:
        params: Mapping that provides ``model_type``, ``max_tokens``,
            ``min_tokens``, ``decoding``, ``temperature``, ``top_k`` and
            ``top_p``.

    Returns:
        A ``Model`` configured with those generation parameters and with the
        module-level API key, service URL and project id.

    Raises:
        KeyError: If ``params`` lacks one of the keys listed above.
    """
    # (generation-parameter name, key in ``params``), in lookup order.
    param_sources = (
        (GenParams.MAX_NEW_TOKENS, "max_tokens"),
        (GenParams.MIN_NEW_TOKENS, "min_tokens"),
        (GenParams.DECODING_METHOD, "decoding"),
        (GenParams.TEMPERATURE, "temperature"),
        (GenParams.TOP_K, "top_k"),
        (GenParams.TOP_P, "top_p"),
    )
    generation_settings = {
        gen_name: params[source_key] for gen_name, source_key in param_sources
    }
    service_credentials = {"apikey": API_KEY, "url": WATSONX_URL}
    return Model(
        model_id=params["model_type"],
        params=generation_settings,
        credentials=service_credentials,
        project_id=PROJECT_ID,
    )
def extract_text(url, timeout=30):
    """Download *url* and return the text of its ``<p>`` elements as one string.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The paragraph texts joined by single spaces, with non-breaking spaces
        (``\\xa0``) replaced by ordinary spaces. Empty when the page has no
        ``<p>`` elements.

    Raises:
        RuntimeError: If the download, the HTTP status check or the parsing
            fails; the original exception is attached as ``__cause__``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        paragraphs = (p.get_text() for p in soup.find_all("p"))
        return " ".join(paragraphs).replace("\xa0", " ")
    except Exception as e:
        # Callers rely on a single exception type, so every failure is wrapped;
        # "from e" keeps the root cause visible in the traceback.
        raise RuntimeError(f"Failed to extract text from {url}: {e}") from e
# spaCy pipelines that have already been loaded, keyed by model name.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(name="en_core_web_md"):
    """Return the spaCy pipeline *name*, loading it only on first use."""
    if name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[name] = spacy.load(name)
    return _SPACY_PIPELINES[name]


def split_text_into_sentences(text):
    """Split *text* into sentences using spaCy.

    Args:
        text: The text to segment.

    Returns:
        A list of sentences with surrounding whitespace removed. Sentences
        that are empty after stripping are left out, and empty or
        whitespace-only input yields an empty list.
    """
    # Nothing to segment: answer immediately without loading the pipeline.
    if not text or not text.strip():
        return []
    # Loading the model is expensive, so it is done once and reused.
    nlp = _get_spacy_pipeline()
    stripped = (sent.text.strip() for sent in nlp(text).sents)
    return [sentence for sentence in stripped if sentence]
def create_embedding(url, collection_name, client):
    """Scrape *url*, split it into sentences and store them in a collection.

    Args:
        url: Page whose paragraph text is indexed.
        collection_name: Name of the collection to fetch or create.
        client: Object providing ``get_or_create_collection`` (the script
            passes the result of ``chromadb_client()``).

    Returns:
        The collection after the sentences have been upserted.

    Raises:
        RuntimeError: If the page cannot be scraped (raised by
            ``extract_text``).
        ValueError: If the page yields no sentences to index.

    NOTE(review): ids are the sentence positions ("0", "1", ...), so reusing a
    collection name for a different URL presumably overwrites entries that
    share an index and leaves the rest behind -- confirm this is intended.
    """
    text = extract_text(url)
    # Blank entries carry nothing worth embedding, so they are left out.
    sentences = [s for s in split_text_into_sentences(text) if s.strip()]
    if not sentences:
        # Fail with a clear message instead of handing empty lists to upsert.
        raise ValueError(f"No text could be extracted from {url}; nothing to embed.")

    ids = [str(position) for position in range(len(sentences))]
    collection = client.get_or_create_collection(collection_name)
    collection.upsert(
        documents=sentences,
        metadatas=[{"source": doc_id} for doc_id in ids],
        ids=ids,
    )
    return collection
def create_prompt(url, question, collection_name, client, n_results=5):
    """Build the chat prompt for *question* from the page at *url*.

    The page is indexed with ``create_embedding``, the collection is queried
    with the question, and the returned documents become the prompt context.

    Args:
        url: Page to index and draw context from.
        question: The user's question; used as the query text and embedded in
            the prompt.
        collection_name: Name of the collection that stores the sentences.
        client: Object providing ``get_or_create_collection``.
        n_results: How many documents to request from the collection query.

    Returns:
        The complete prompt string, ending with the assistant header so that
        the model continues with its answer.

    Raises:
        RuntimeError: If indexing or querying fails; the original exception is
            attached as ``__cause__``.
    """
    try:
        collection = create_embedding(url, collection_name, client)
        relevant_chunks = collection.query(query_texts=[question], n_results=n_results)
        # One query text was sent, so its documents are the first result entry.
        context = "\n\n\n".join(relevant_chunks["documents"][0])
        prompt = (
            "<|begin_of_text|>\n"
            "<|start_header_id|>system<|end_header_id|>\n"
            "You are a helpful AI assistant.\n"
            "<|eot_id|>\n"
            "<|start_header_id|>user<|end_header_id|>\n"
            f"### Context:\n{context}\n\n"
            "### Instruction:\n"
            "Please answer the following question based on the above context. Your answer should be concise and directly address the question. "
            "If the question is unanswerable based on the given context, respond with 'unanswerable'.\n\n"
            f"### Question:\n{question}\n"
            "<|eot_id|>\n"
            "<|start_header_id|>assistant<|end_header_id|>\n"
        )
        return prompt
    except Exception as e:
        # Wrap every failure in one exception type while keeping the root cause.
        raise RuntimeError(f"Error creating prompt: {e}") from e
def answer_questions_from_web(url, question, collection_name, client):
    """Answer *question* from the content of *url* using the watsonx model.

    The model is built from ``MODEL_PARAMS``, the prompt is assembled by
    ``create_prompt``, and the text of the first generation result is returned
    with surrounding whitespace removed.
    """
    llm = get_model(MODEL_PARAMS)
    rag_prompt = create_prompt(url, question, collection_name, client)
    reply = llm.generate(prompt=rag_prompt)
    first_result = reply["results"][0]
    return first_result["generated_text"].strip()
def main():
    """Run one hard-coded example question through the pipeline and print the answer."""
    answer = answer_questions_from_web(
        url="https://huggingface.co/learn/nlp-course/chapter1/2?fw=pt",
        question="What is NLP?",
        collection_name="test_web_RAG",
        client=chromadb_client(),
    )
    print("Generated Response:", answer, sep="\n")


# Run the demo only when the file is executed as a script.
if __name__ == "__main__":
    main()