# NOTE: removed "Spaces: Sleeping" page-scrape artifact that preceded the code.
import ast
import os
from typing import List

import requests
from bs4 import BeautifulSoup
from huggingface_hub import InferenceClient
class ServerlessInference:
    """
    Interface to the HF serverless inference API.

    Wraps one chat model plus two vector stores (document text and image
    captions) to provide plain chat (`test`) and retrieval-augmented
    generation with image suggestions (`perform_rag`).
    """

    def __init__(self, vector_store_text=None, vector_store_images=None):
        """
        Args:
            vector_store_text: vector store over document text chunks.
            vector_store_images: vector store over image captions/descriptions.
        """
        self.model: str = "HuggingFaceH4/zephyr-7b-beta"
        # NOTE(review): "HF_SERVELESS_API" looks like a typo of
        # "HF_SERVERLESS_API", but it is kept as-is because the deployment
        # environment presumably defines the variable under this exact name.
        self.client = InferenceClient(api_key=os.getenv("HF_SERVELESS_API"))
        self.vs_text = vector_store_text
        self.vs_images = vector_store_images

    def _chat(self, messages: list) -> str:
        """Send one chat-completion request and return the generated text."""
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=500,
        )
        return completion.choices[0].message.content

    def test(self, query: str) -> str:
        '''Responds to generic query using llm'''
        return self._chat([{"role": "user", "content": query}])

    def perform_rag(self, query: str):
        """
        Answer `query` from the text store, then select supporting images.

        Returns:
            (response_text, response_images): the generated answer and a
            (possibly empty) list of (image_url, caption) pairs.
        """
        # --- Text retrieval ---
        retrieved_docs = self.vs_text.similarity_search(query=query, k=5)
        retrieved_docs_text = [doc.page_content for doc in retrieved_docs]  # We only need the text of the documents
        context = "\nExtracted documents:\n"
        context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])

        # --- Augmented generation: produce the text answer ---
        messages: list = [
            {
                "role": "system",
                "content": """Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
If the answer cannot be deduced from the context, do not give an answer. Instead say `Theres lack of information in document source.`""",
            },
            {
                "role": "user",
                "content": """Context:
{context}
---
Now here is the question you need to answer.
Question: {question}""".format(context=context, question=query),
            },
        ]
        response_text = self._chat(messages)

        # --- Image retrieval: ask the model which stored images match ---
        retrieved_image = self.vs_images.similarity_search(query=query, k=5)
        retrieved_docs_text = [doc.page_content for doc in retrieved_image]  # We only need the text of the documents
        context = "\nExtracted Images:\n"
        context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])

        # One-shot example (user + assistant turns) teaches the model the
        # expected "[i, j]" output format before the real request.
        messages = [
            {
                "role": "system",
                "content": """Using the information contained in the context about the images stored in the database,
give a list of identifiers of the image that best represent the kind of information seeked by the document description of user.
Respond only to the question asked. Provide only number(s) of the source images relevant to the question.
If the image is relevant to the question then output format should be a list [1, 3, 0]
otherwise just reply empty list that is []""",
            },
            {
                "role": "user",
                "content": """Context:
Extracted Images:
Document 0:::
Rahuls playing football
Document 1:::
Rahul recieving award in Archery.
---
Now here is the question you need to answer.
Document Description: Rahul is excellent player of archery and great poet.""",
            },
            {
                "role": "assistant",
                "content": "[1, ]",
            },
            {
                "role": "user",
                "content": """Context:
{context}
---
Now here is the question you need to answer.
Document Description: {response_text}""".format(context=context, response_text=response_text),
            },
        ]

        images_list_str: str = ""
        response_images = []
        try:
            images_list_str = self._chat(messages)
            images_list: list = parse(images_list_str)
            # Create (link, caption) pair for each suggested image index.
            for idx in images_list:
                caption = retrieved_image[idx].page_content
                url = get_wiki_file_to_image_url(retrieved_image[idx].metadata["url"])
                response_images.append(
                    (url, caption)
                )
        except Exception as e:
            # Bug fix: the original printed `images_list`, which is unbound
            # when the completion call itself fails (NameError inside the
            # handler). Log the raw model reply and the actual error instead.
            print(f"Error in parsing suggested images ({e}): {images_list_str}")
            response_images = []
        return response_text, response_images
def parse(value: str) -> List[int]:
    """
    Extract a list of numbers from the given string.

    Parameters:
        value (str): The input string containing the list of numbers.

    Returns:
        list: A list of numbers if found, otherwise an empty list.
    """
    try:
        # Find the substring that looks like a list.
        start = value.index('[')
        # Bug fix: search for the closing bracket *after* the opening one,
        # so stray "]" characters before the list no longer break parsing.
        end = value.index(']', start)
        # Extract and parse it into a Python object.
        result = ast.literal_eval(value[start:end + 1])
        # literal_eval may yield something other than a list (e.g. a tuple
        # or a scalar); normalize so callers always get a list.
        if isinstance(result, (list, tuple)):
            return list(result)
        return []
    except (ValueError, SyntaxError):
        # Return an empty list if parsing fails.
        return []
def get_wiki_file_to_image_url(file_page_url: str) -> str:
    """
    Resolve a Wikimedia "File:" page URL to the direct image URL.

    Parameters:
        file_page_url (str): URL of the file description page.

    Returns:
        str: The direct image URL ("https:" + href of the first
        <a class="internal"> tag), or `file_page_url` unchanged when the
        page cannot be fetched or no image link is found.
    """
    # Headers to mimic a browser (some wikis reject default clients).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    # Step 1: Get the file page HTML. Timeout added so a stalled request
    # cannot hang the caller indefinitely.
    response = requests.get(file_page_url, headers=headers, timeout=10)
    if response.status_code == 200:
        # Parse the HTML content.
        soup = BeautifulSoup(response.content, "html.parser")
        # Step 2: Find the link to the image file.
        image_tag = soup.find("a", {"class": "internal"})
        if image_tag and "href" in image_tag.attrs:
            return "https:" + image_tag["href"]
    # Bug fix: the original implicitly returned None on a non-200 response,
    # but callers expect a URL string; fall back to the input page URL.
    return file_page_url