Spaces:

mangoman7002
/

WebAPI

Paused

App Files Files Community

WebAPI / app.py

mangoman7002

Upload 6 files

ad06298 verified about 1 year ago

raw

history blame contribute delete

8 kB

	import time
	import copy
	import os
	from flask import Flask, render_template, request
	import json
	from main import bing_serach, extract_web
	import asyncio
	import requests
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	import numpy as np
	from pymongo import MongoClient
	client = MongoClient('mongodb://localhost:27017/') # Replace with your MongoDB URI

	# Create or access a database
	db = client['webdata'] # Replace 'my_database' with your database name

	collection = db['data']

	def cosine_similarity(vec1, vec2):
	# Compute the dot product between the two vectors
	dot_product = np.dot(vec1, vec2)

	# Compute the magnitude (norm) of the vectors
	norm_vec1 = np.linalg.norm(vec1)
	norm_vec2 = np.linalg.norm(vec2)

	# Compute cosine similarity
	similarity = dot_product / (norm_vec1 * norm_vec2)

	return similarity

	def percentage_similarity(vec1, vec2):
	# Get cosine similarity

	try:
	cosine_sim = cosine_similarity(vec1, vec2)

	# Convert cosine similarity to percentage similarity
	percentage_sim = (cosine_sim + 1) / 2 * 100 # Shift range from [-1,1] to [0,100]

	return percentage_sim
	except:
	return 0
	text_splitter = RecursiveCharacterTextSplitter(
	# Set a really small chunk size, just to show.
	chunk_size=2000,
	chunk_overlap=100,
	length_function=len,
	is_separator_regex=False,
	)
	app = Flask(__name__)
	@app.route("/status", methods=['GET'])
	def status():
	return "OK"

	@app.route("/", methods=['GET','POST'])
	def websearch():
	try:
	if request.args.get('q'):
	query = request.args.get('q')
	ifextract = request.args.get('ifextract')
	try:
	start = int(request.args.get('start'))
	except:
	start = 0
	if ifextract == '1':
	return asyncio.run(bing_serach(query,collection,ifextract=True,start=start))
	elif ifextract == '0':
	return asyncio.run(bing_serach(query,collection,ifextract=False,start=start))
	else:
	return '<h1>Invalid Value of ifextract</h1><br>it can Two Value either 0 or 1<br> for 1 it will provide Webpage Extracted'
	else:
	return '<h1>Enter Valid Query</h1> <br> GET parameters<br>1. q(query) = Search query in quote_plus ex: Is+Mango+Sweet<br>1. ifextract(ifextract) = 0,1 for 1 it will provide extracted webpage for suitable websites<br>2. startIndex(start) =Optional Ender the start index of search query'
	except Exception as e:
	return {'type':'error','message':'Unexpected Error',"detail":str(e)}
	@app.route("/adv",methods=["POST","GET"])
	def adv_make():
	global collection
	args = request.get_json()
	if all(key in args for key in ['long_query', 'short_query']):
	short_query = args["short_query"]
	dataz = asyncio.run(bing_serach(short_query, collection, ifextract=True))
	data = dataz['result']
	with open("r.json",'w') as f:
	f.write(json.dumps(data,indent=4))
	toembed = [z['webpage'] for z in data if "embedding_data" not in z and z['webpage'] != "Some Error while Extracting"]

	# Split these documents into chunks
	toemb = [text_splitter.create_documents([z]) for z in toembed]

	# Flatten the document chunks
	toembz = [sublist.page_content for z in toemb for sublist in z]
	print("Length of Documents")
	print(len(toembz))
	if(len(toembz) > 0):
	data_to_send = {
	"text":toembz
	}
	embedding = requests.post("https://mangoman7002-flash-embedding.hf.space",json=data_to_send)
	if(embedding.status_code != 200):
	return json.dumps({"type":"error","message":f"error With API {str(embedding.status_code)}"},indent=4)
	embedding = embedding.json()
	else:
	embedding = {'result':[]}
	data_to_send = {
	"text":[args['long_query']]
	}
	query_embedding = requests.post("https://mangoman7002-flash-embedding.hf.space",json=data_to_send)
	if(query_embedding.status_code != 200):
	return json.dumps({"type":"error","message":f"error With API {str(embedding.status_code)}"},indent=4)
	query_embedding = query_embedding.json()
	results = embedding['result']
	current_index=0
	embedding_index = 0
	for index,value in enumerate(dataz['result']):
	if("embedding_data" in dataz['result'][index] and dataz['result'][index]['webpage'] != "Some Error while Extracting"):
	pass
	elif(dataz['result'][index]['webpage'] != "Some Error while Extracting"):
	em_vector = results[embedding_index:embedding_index+len(toemb[current_index])]
	embedding_index+=len(toemb[current_index])
	dataz['result'][index]['embedding_data'] = em_vector
	current_index+=1
	else:
	pass
	final_results = []
	for z in range(len(dataz['result'])):
	thisdata = copy.deepcopy(dataz['result'][z])
	# data['result'][z].pop("embedding")
	collection.update_one({"URL":thisdata['URL']},{"$set":thisdata})
	for z in copy.deepcopy(dataz['result']):
	try:
	for a in copy.deepcopy(z['embedding_data']):
	results.append(a)
	except:
	pass
	results = copy.deepcopy(results)
	for thisr in results:
	thisr['similairy'] = percentage_similarity(thisr['embedding'],query_embedding['result'][0]['embedding'])
	final_results.append(thisr)
	final_results = [z for z in final_results if z['similairy'] > 80]
	final_results = sorted(final_results,key=lambda x:x['similairy'],reverse=True )
	remove_embedding = [z.pop("embedding") for z in final_results]
	dataz['extracts'] = final_results
	return dataz

	else:
	return(json.dumps({"type":'error','message':"long_query and short_query is not in request"},indent=4))

	@app.route("/webpage",methods=["POST","GET"])
	def webpage():
	global collection
	args = request.get_json()
	url = args.get("url",None)
	if(url == None):
	return(json.dumps({'type':'error','message':'url is not provided'},indent=4))
	else:
	previous_data = collection.find_one({"URL":url})
	if(previous_data is None):
	result = {}
	result['URL'] = url
	result['time'] = time.time()
	result['webpage'] = asyncio.run(extract_web(result))
	else:
	time_change = time.time() - previous_data['time']
	if(time_change < 86400):
	result = previous_data
	else:
	result = {}
	result['time'] = time.time()
	result['URL'] = url
	result['webpage'] = asyncio.run(extract_web(result))


	if("embedding_data" not in result and result['webpage'] != "Some Error while Extracting"):
	toemb = text_splitter.create_documents([result['webpage']])
	toembz = [z.page_content for z in toemb]
	data_to_send = {
	"text":toembz
	}

	embedding = requests.post("https://mangoman7002-flash-embedding.hf.space",json=data_to_send)
	if(embedding.status_code != 200):
	return json.dumps({"type":"error","message":f"error With API {str(embedding.status_code)}"},indent=4)
	embedding = embedding.json()
	result['embedding_data'] = embedding['result']
	try:
	result.pop("_id")
	except:
	pass
	return(json.dumps(result))
	if __name__ == '__main__':
	app.run(debug=False)