# NOTE: page-scrape artifacts removed (UI labels, file size, line-number gutter).
# Source snapshot commit: ad06298
import time
import copy
import os
from flask import Flask, render_template, request
import json
from main import bing_serach, extract_web
import asyncio
import requests
from langchain_text_splitters import RecursiveCharacterTextSplitter
import numpy as np
from pymongo import MongoClient
# MongoDB connection used as a cache for fetched/extracted web pages.
client = MongoClient('mongodb://localhost:27017/')  # Replace with your MongoDB URI
# Create or access a database
db = client['webdata']  # Replace 'webdata' with your database name
# Cached documents; presumably keyed by "URL" (see update_one/find_one below) -- TODO confirm index exists.
collection = db['data']
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two numeric vectors.

    Args:
        vec1, vec2: array-likes of equal length.

    Returns:
        float in [-1.0, 1.0]; 0.0 when either vector has zero magnitude,
        where cosine similarity is undefined (the original code divided by
        zero there and silently produced ``nan``).
    """
    # Dot product of the two vectors.
    dot_product = np.dot(vec1, vec2)
    # Magnitudes (L2 norms) of each vector.
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    denom = norm_vec1 * norm_vec2
    if denom == 0:
        # Zero-length input: similarity is undefined; report "no similarity"
        # instead of propagating nan into downstream percentage math.
        return 0.0
    return dot_product / denom
def percentage_similarity(vec1, vec2):
    """Map the cosine similarity of *vec1*/*vec2* to a 0-100 percentage.

    Shifts cosine similarity from [-1, 1] into [0, 100]. Returns 0 when the
    similarity cannot be computed (mismatched shapes, non-numeric input) so
    callers can treat "error" as "no match".
    """
    try:
        cosine_sim = cosine_similarity(vec1, vec2)
        # Shift range from [-1, 1] to [0, 100].
        return (cosine_sim + 1) / 2 * 100
    except (TypeError, ValueError):
        # Narrowed from a bare ``except:`` -- only malformed-input errors are
        # expected here; anything else should surface instead of being hidden.
        return 0
# Splitter used to chunk scraped webpage text before sending it to the
# embedding API: 2000-character chunks with a 100-character overlap so
# context is not lost at chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
    length_function=len,  # chunk size measured in characters
    is_separator_regex=False,
)
app = Flask(__name__)


@app.route("/status", methods=['GET'])
def status():
    # Lightweight liveness probe for monitors / load balancers.
    return "OK"
@app.route("/", methods=['GET','POST'])
def websearch():
    """Bing search endpoint.

    Query parameters:
        q: search query, quote_plus encoded (e.g. Is+Mango+Sweet). Required.
        ifextract: '1' to also return extracted webpage text, '0' for plain
            search results; any other value is rejected with a help message.
        start: optional integer start index into the search results.

    Returns the result of ``bing_serach`` (project helper; shape defined in
    main.py) or an HTML/JSON error message.
    """
    try:
        # Single lookup instead of the original's double request.args.get('q').
        query = request.args.get('q')
        if query:
            ifextract = request.args.get('ifextract')
            try:
                start = int(request.args.get('start'))
            except (TypeError, ValueError):
                # 'start' absent (None -> TypeError) or non-numeric
                # (-> ValueError): default to the first page. Narrowed
                # from a bare ``except:``.
                start = 0
            if ifextract == '1':
                return asyncio.run(bing_serach(query, collection, ifextract=True, start=start))
            elif ifextract == '0':
                return asyncio.run(bing_serach(query, collection, ifextract=False, start=start))
            else:
                return '<h1>Invalid Value of ifextract</h1><br>it can Two Value either 0 or 1<br> for 1 it will provide Webpage Extracted'
        else:
            return '<h1>Enter Valid Query</h1> <br> GET parameters<br>1. q(query) = Search query in quote_plus ex: Is+Mango+Sweet<br>1. ifextract(ifextract) = 0,1 for 1 it will provide extracted webpage for suitable websites<br>2. startIndex(start) =Optional Ender the start index of search query'
    except Exception as e:
        # Top-level route boundary: report unexpected failures as JSON
        # rather than a 500 page.
        return {'type':'error','message':'Unexpected Error',"detail":str(e)}
@app.route("/adv", methods=["POST","GET"])
def adv_make():
    """Semantic search: Bing-search ``short_query``, embed the scraped pages,
    then rank text chunks by similarity to ``long_query``.

    Expects a JSON body with 'short_query' (sent to Bing) and 'long_query'
    (compared against page chunks). Returns the search payload with an
    added 'extracts' list of chunks whose similarity to the long query
    exceeds 80%, sorted best-first.

    NOTE(review): the embedding bookkeeping below is order-sensitive; this
    doc pass deliberately leaves the code untouched.
    """
    global collection
    args = request.get_json()
    if all(key in args for key in ['long_query', 'short_query']):
        short_query = args["short_query"]
        # Full search with page extraction; results cached in Mongo by URL.
        dataz = asyncio.run(bing_serach(short_query, collection, ifextract=True))
        data = dataz['result']
        # Debug dump of the raw search result -- presumably for inspection only.
        with open("r.json", 'w') as f:
            f.write(json.dumps(data, indent=4))
        # Pages that still need embeddings: not cached and extraction succeeded.
        toembed = [z['webpage'] for z in data if "embedding_data" not in z and z['webpage'] != "Some Error while Extracting"]
        # Split these documents into chunks (one chunk-list per page).
        toemb = [text_splitter.create_documents([z]) for z in toembed]
        # Flatten the document chunks into one list of strings for the API.
        toembz = [sublist.page_content for z in toemb for sublist in z]
        print("Length of Documents")
        print(len(toembz))
        if (len(toembz) > 0):
            data_to_send = {
                "text": toembz
            }
            # External embedding service; returns one entry per input chunk
            # (assumed -- the slicing below depends on this; TODO confirm).
            embedding = requests.post("https://mangoman7002-flash-embedding.hf.space", json=data_to_send)
            if (embedding.status_code != 200):
                return json.dumps({"type":"error","message":f"error With API {str(embedding.status_code)}"}, indent=4)
            embedding = embedding.json()
        else:
            # Nothing new to embed -- everything was cached.
            embedding = {'result': []}
        data_to_send = {
            "text": [args['long_query']]
        }
        # Embed the long query itself for the similarity comparison.
        query_embedding = requests.post("https://mangoman7002-flash-embedding.hf.space", json=data_to_send)
        if (query_embedding.status_code != 200):
            # NOTE(review): reports ``embedding.status_code`` although this
            # branch failed on ``query_embedding`` -- looks like a copy-paste
            # slip; left as-is in this documentation pass.
            return json.dumps({"type":"error","message":f"error With API {str(embedding.status_code)}"}, indent=4)
        query_embedding = query_embedding.json()
        results = embedding['result']
        # Walk the search results in order, carving consecutive slices of
        # ``results`` back onto the pages that were just embedded.
        # current_index tracks position in ``toemb``; embedding_index tracks
        # position in the flat ``results`` list.
        current_index = 0
        embedding_index = 0
        for index, value in enumerate(dataz['result']):
            if ("embedding_data" in dataz['result'][index] and dataz['result'][index]['webpage'] != "Some Error while Extracting"):
                # Already has cached embeddings -- nothing to attach.
                pass
            elif (dataz['result'][index]['webpage'] != "Some Error while Extracting"):
                # Freshly embedded page: take the next len(chunks) vectors.
                em_vector = results[embedding_index:embedding_index + len(toemb[current_index])]
                embedding_index += len(toemb[current_index])
                dataz['result'][index]['embedding_data'] = em_vector
                current_index += 1
            else:
                # Extraction failed for this page -- skip.
                pass
        final_results = []
        # Persist every (possibly updated) result back to the Mongo cache.
        for z in range(len(dataz['result'])):
            thisdata = copy.deepcopy(dataz['result'][z])
            # data['result'][z].pop("embedding")
            collection.update_one({"URL": thisdata['URL']}, {"$set": thisdata})
        # Collect every page's embedding_data chunks into ``results``.
        # NOTE(review): ``results`` still holds the fresh API vectors, so
        # freshly-embedded chunks appear to be appended a second time here
        # (cached pages' chunks are what this loop presumably intends to add)
        # -- verify for duplicate entries in the output.
        for z in copy.deepcopy(dataz['result']):
            try:
                for a in copy.deepcopy(z['embedding_data']):
                    results.append(a)
            except:
                # Pages without 'embedding_data' (failed extraction) raise
                # KeyError and are skipped.
                pass
        results = copy.deepcopy(results)
        # Score every chunk against the long query. Each entry is assumed to
        # be a dict with an 'embedding' key (API response schema) -- TODO confirm.
        for thisr in results:
            thisr['similairy'] = percentage_similarity(thisr['embedding'], query_embedding['result'][0]['embedding'])
            final_results.append(thisr)
        # Keep only strong matches (> 80%), best first. (The key name
        # 'similairy' is misspelled but is part of the response contract.)
        final_results = [z for z in final_results if z['similairy'] > 80]
        final_results = sorted(final_results, key=lambda x: x['similairy'], reverse=True)
        # Strip the raw vectors from the response payload.
        remove_embedding = [z.pop("embedding") for z in final_results]
        dataz['extracts'] = final_results
        return dataz
    else:
        return (json.dumps({"type": 'error', 'message': "long_query and short_query is not in request"}, indent=4))
@app.route("/webpage", methods=["POST","GET"])
def webpage():
    """Fetch (or return cached) extracted text for a single URL.

    Expects a JSON body with a 'url' key. Cached documents younger than
    24 hours are served from Mongo; otherwise the page is re-extracted.
    Chunk embeddings are computed for pages that do not have them yet and
    included in the response under 'embedding_data'.

    NOTE(review): the fetched/embedded result is never written back to
    ``collection`` here, so the cache is read-only for this route -- confirm
    whether that is intentional.
    """
    global collection
    args = request.get_json()
    url = args.get("url", None)
    if url is None:  # was ``url == None``; identity check is correct for None
        return (json.dumps({'type': 'error', 'message': 'url is not provided'}, indent=4))

    previous_data = collection.find_one({"URL": url})
    if previous_data is None:
        # Never seen this URL: extract it fresh.
        result = {}
        result['URL'] = url
        result['time'] = time.time()
        result['webpage'] = asyncio.run(extract_web(result))
    else:
        time_change = time.time() - previous_data['time']
        if time_change < 86400:  # cache entries are valid for 24 hours
            result = previous_data
        else:
            # Stale cache entry: re-extract the page.
            result = {}
            result['time'] = time.time()
            result['URL'] = url
            result['webpage'] = asyncio.run(extract_web(result))

    # Embed the page text unless embeddings are cached or extraction failed.
    if "embedding_data" not in result and result['webpage'] != "Some Error while Extracting":
        toemb = text_splitter.create_documents([result['webpage']])
        toembz = [z.page_content for z in toemb]
        data_to_send = {
            "text": toembz
        }
        embedding = requests.post("https://mangoman7002-flash-embedding.hf.space", json=data_to_send)
        if embedding.status_code != 200:
            return json.dumps({"type": "error", "message": f"error With API {str(embedding.status_code)}"}, indent=4)
        embedding = embedding.json()
        result['embedding_data'] = embedding['result']

    # Drop Mongo's ObjectId, which is not JSON-serializable; the default
    # avoids the original bare ``except:`` around a pop().
    result.pop("_id", None)
    return (json.dumps(result))
if __name__ == '__main__':
    # Run the Flask development server (debug off).
    # Fix: removed a stray trailing "|" artifact that made the line a syntax error.
    app.run(debug=False)