File size: 5,282 Bytes
dee16ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import os
from functools import lru_cache

import chromadb
import requests
import spacy
from bs4 import BeautifulSoup
from chromadb.api.types import EmbeddingFunction
from dotenv import load_dotenv
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from sentence_transformers import SentenceTransformer

from utils import chromadb_client

# Load environment variables (WatsonX credentials) from the local .env file.
load_dotenv()

# Global constants
WATSONX_URL = "https://us-south.ml.cloud.ibm.com"  # IBM WatsonX regional endpoint
API_KEY = os.getenv("api_key")
PROJECT_ID = os.getenv("project_id")

# Validate environment variables — fail fast at import time so a missing
# credential surfaces before any scraping/embedding work is done.
if not API_KEY or not PROJECT_ID:
    raise ValueError("API Key or Project ID is missing from the .env file.")

# Model and decoding parameters, consumed by get_model(); keys map 1:1 onto
# GenTextParamsMetaNames entries.
# NOTE(review): temperature/top_k/top_p are presumably inert under GREEDY
# decoding — confirm against the WatsonX SDK docs.
MODEL_PARAMS = {
    "model_type": "meta-llama/llama-3-70b-instruct",
    "max_tokens": 100,  # upper bound on generated tokens
    "min_tokens": 50,   # lower bound on generated tokens
    "decoding": DecodingMethods.GREEDY,
    "temperature": 0.7,
    "top_k": 50,
    "top_p": 1,
}

# Set up cache directory so Hugging Face downloads are reused across runs.
# HF_HOME must be set before the SentenceTransformer below is constructed.
CACHE_DIR = os.path.join(os.getcwd(), ".cache")
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ["HF_HOME"] = CACHE_DIR

# Load the sentence-transformers model used for all document/query embeddings.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(MODEL_NAME, cache_folder=CACHE_DIR)

# Embedding function
class MiniLML6V2EmbeddingFunction(EmbeddingFunction):
    """ChromaDB embedding function backed by the shared MiniLM sentence encoder."""

    # Reuse the module-level model so the weights are loaded only once.
    MODEL = embedding_model

    def __call__(self, texts):
        """Encode *texts* and return the embedding vectors as plain Python lists."""
        vectors = self.MODEL.encode(texts)
        return vectors.tolist()


def get_model(params):
    """Construct an IBM WatsonX foundation-model client.

    Args:
        params: Dict with keys ``model_type``, ``max_tokens``, ``min_tokens``,
            ``decoding``, ``temperature``, ``top_k``, ``top_p`` (see MODEL_PARAMS).

    Returns:
        A configured ``Model`` instance bound to the module-level credentials.
    """
    # Translate the flat config dict into the SDK's generation-parameter names.
    decoding_config = {
        GenParams.MAX_NEW_TOKENS: params["max_tokens"],
        GenParams.MIN_NEW_TOKENS: params["min_tokens"],
        GenParams.DECODING_METHOD: params["decoding"],
        GenParams.TEMPERATURE: params["temperature"],
        GenParams.TOP_K: params["top_k"],
        GenParams.TOP_P: params["top_p"],
    }
    credentials = {"apikey": API_KEY, "url": WATSONX_URL}
    return Model(
        model_id=params["model_type"],
        params=decoding_config,
        credentials=credentials,
        project_id=PROJECT_ID,
    )


def extract_text(url, timeout=30):
    """Scrape and return the concatenated paragraph text from *url*.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the HTTP response (new, defaults to 30;
            the previous behavior of no timeout could hang indefinitely).

    Returns:
        All ``<p>`` tag text joined with spaces, with non-breaking spaces
        normalized to regular spaces.

    Raises:
        RuntimeError: If the request fails or the page cannot be parsed;
            chained to the original exception.
    """
    try:
        # A request without a timeout can block forever on a stalled server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        paragraphs = [p.get_text() for p in soup.find_all("p")]
        return " ".join(paragraphs).replace("\xa0", " ")
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"Failed to extract text from {url}: {e}") from e


@lru_cache(maxsize=1)
def _load_spacy_pipeline():
    """Load the SpaCy pipeline once per process; reloading it per call is very slow."""
    return spacy.load("en_core_web_md")


def split_text_into_sentences(text):
    """Split *text* into sentences using SpaCy.

    Args:
        text: Raw text to segment.

    Returns:
        A list of stripped sentence strings.
    """
    doc = _load_spacy_pipeline()(text)
    return [sent.text.strip() for sent in doc.sents]


def create_embedding(url, collection_name, client):
    """Scrape *url*, split it into sentences, and upsert them into ChromaDB.

    Args:
        url: Page whose paragraph text is embedded.
        collection_name: Name of the Chroma collection to create or reuse.
        client: ChromaDB client instance.

    Returns:
        The populated collection.
    """
    sentences = split_text_into_sentences(extract_text(url))
    # One document per sentence; ids (and "source" metadata) are the
    # stringified sentence positions.
    identifiers = [str(position) for position in range(len(sentences))]
    collection = client.get_or_create_collection(collection_name)
    collection.upsert(
        documents=sentences,
        metadatas=[{"source": identifier} for identifier in identifiers],
        ids=identifiers,
    )
    return collection


def create_prompt(url, question, collection_name, client):
    """Build a Llama-3 chat prompt whose context is retrieved from *url*.

    Embeds the page, retrieves the 5 chunks most similar to *question*, and
    wraps them in the Llama-3 instruct chat template.

    Args:
        url: Source page for the context.
        question: User question to answer.
        collection_name: Chroma collection name for the page's embeddings.
        client: ChromaDB client instance.

    Returns:
        The formatted prompt string, ending at the assistant header so the
        model's completion is the answer.

    Raises:
        RuntimeError: If embedding or retrieval fails; chained to the cause.
    """
    try:
        collection = create_embedding(url, collection_name, client)
        relevant_chunks = collection.query(query_texts=[question], n_results=5)
        context = "\n\n\n".join(relevant_chunks["documents"][0])
        prompt = (
            "<|begin_of_text|>\n"
            "<|start_header_id|>system<|end_header_id|>\n"
            "You are a helpful AI assistant.\n"
            "<|eot_id|>\n"
            "<|start_header_id|>user<|end_header_id|>\n"
            f"### Context:\n{context}\n\n"
            f"### Instruction:\n"
            f"Please answer the following question based on the above context. Your answer should be concise and directly address the question. "
            f"If the question is unanswerable based on the given context, respond with 'unanswerable'.\n\n"
            f"### Question:\n{question}\n"
            "<|eot_id|>\n"
            "<|start_header_id|>assistant<|end_header_id|>\n"
        )
        return prompt
    except Exception as e:
        # Chain the cause so the original failure's traceback is preserved.
        raise RuntimeError(f"Error creating prompt: {e}") from e


def answer_questions_from_web(url, question, collection_name, client):
    """Answer *question* using content retrieved from *url* via WatsonX.

    Args:
        url: Page to use as the knowledge source.
        question: Question to answer.
        collection_name: Chroma collection name for the page's embeddings.
        client: ChromaDB client instance.

    Returns:
        The model's generated answer, stripped of surrounding whitespace.
    """
    prompt = create_prompt(url, question, collection_name, client)
    model = get_model(MODEL_PARAMS)
    response = model.generate(prompt=prompt)
    answer = response["results"][0]["generated_text"]
    return answer.strip()


def main():
    """Run the end-to-end web-RAG demo against a fixed page and question."""
    client = chromadb_client()
    answer = answer_questions_from_web(
        url="https://huggingface.co/learn/nlp-course/chapter1/2?fw=pt",
        question="What is NLP?",
        collection_name="test_web_RAG",
        client=client,
    )
    print("Generated Response:")
    print(answer)


if __name__ == "__main__":
    main()