mangoman7002 committed on
Commit
ad06298
·
verified ·
1 Parent(s): d1d7ec6

Upload 6 files

Browse files
Files changed (6) hide show
  1. Dockerfile +66 -0
  2. app.py +192 -0
  3. main.py +104 -0
  4. pattern_functions.py +28 -0
  5. patterns.py +9 -0
  6. requirements.txt +9 -0
Dockerfile ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10

WORKDIR /home/

# Copy requirements first so dependency installation is cached independently
# of application-code changes.
COPY ./requirements.txt /home/requirements.txt

RUN apt update && \
    apt install -y --no-install-recommends \
    curl \
    git \
    git-lfs \
    libatomic1 \
    locales \
    man \
    nano \
    net-tools \
    openssh-client \
    python3 \
    python3-pip \
    python3-venv \
    sudo \
    vim \
    wget \
    zsh \
    zip \
    unzip \
    ffmpeg \
    imagemagick \
    && git lfs install \
    && rm -rf /var/lib/apt/lists/*

ENV USERNAME=user \
    USER_UID=1000 \
    USER_GID=1000 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=all \
    EDITOR=code \
    VISUAL=code \
    GIT_EDITOR="code --wait" \
    OPENVSCODE_SERVER_ROOT=/home/.vscode \
    OPENVSCODE=/home/.vscode/bin/openvscode-server
ENV DEBIAN_FRONTEND=dialog

# MongoDB 8.0 for Ubuntu focal needs libssl1.1, which the python:3.10 base
# image no longer ships. These RUNs execute as root, so `sudo` is unnecessary;
# the downloaded .deb files are removed so they don't bloat the image layers.
RUN wget http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
    dpkg -i libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
    rm libssl1.1_1.1.1f-1ubuntu2_amd64.deb
RUN wget https://repo.mongodb.org/apt/ubuntu/dists/focal/mongodb-org/8.0/multiverse/binary-amd64/mongodb-org-server_8.0.0_amd64.deb && \
    apt install -y ./mongodb-org-server_8.0.0_amd64.deb && \
    rm mongodb-org-server_8.0.0_amd64.deb

RUN pip install --no-cache-dir --upgrade -r /home/requirements.txt

# First copy lands in /home (current WORKDIR); the application is served
# from /home/user/ below. NOTE(review): the source tree is copied twice —
# confirm whether the /home copy is still needed.
COPY . .
WORKDIR /home/user/

# Creating the user and usergroup
RUN groupadd --gid ${USER_GID} ${USERNAME} \
    && useradd --uid ${USER_UID} --gid ${USERNAME} -m -s /bin/bash ${USERNAME} \
    && echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \
    && chmod 0440 /etc/sudoers.d/${USERNAME}

RUN chmod g+rw /home && \
    chown -R ${USERNAME}:${USERNAME} /home/${USERNAME}
# /db is mongod's data directory; it must be writable by the runtime user.
RUN mkdir -p /db && chown -R ${USERNAME}:${USERNAME} /db
USER $USERNAME
# Copy the application code into the serving directory (/home/user/).
COPY . .
# Start MongoDB in the background (forked, logging to mongod.log), then serve
# the Flask app on the port expected by HF Spaces (7860).
CMD ["sh", "-c", "Malloc=system mongod --dbpath /db --logpath mongod.log --fork && flask run --host=0.0.0.0 --port=7860"]
app.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import copy
3
+ import os
4
+ from flask import Flask, render_template, request
5
+ import json
6
+ from main import bing_serach, extract_web
7
+ import asyncio
8
+ import requests
9
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
10
+ import numpy as np
11
+ from pymongo import MongoClient
12
# Local mongod is started alongside Flask by the Docker CMD; replace the URI
# when pointing at an external MongoDB deployment.
client = MongoClient('mongodb://localhost:27017/')

# Cache database for fetched and extracted web pages.
db = client['webdata']

# One document per URL: Abstract/Title/URL plus extracted webpage text and,
# once computed, its chunk embeddings under 'embedding_data'.
collection = db['data']
18
+
19
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    The result lies in [-1, 1]: 1 for parallel vectors, 0 for orthogonal
    ones, -1 for opposite directions.
    """
    numerator = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return numerator / denominator
31
+
32
def percentage_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors rescaled to 0-100.

    Cosine similarity lies in [-1, 1]; this maps it linearly so opposite
    vectors score 0, orthogonal vectors 50 and parallel vectors 100.

    Returns 0 when either vector has zero magnitude or on any computation
    failure. (Previously a zero-magnitude input produced NaN — numpy's 0/0
    does not raise, so the bare ``except`` never fired — and NaN then leaked
    into callers that sort and threshold on this value.)
    """
    try:
        v1 = np.asarray(vec1, dtype=float)
        v2 = np.asarray(vec2, dtype=float)
        denom = np.linalg.norm(v1) * np.linalg.norm(v2)
        if denom == 0:
            # Zero vector: no direction, so report no similarity.
            return 0
        cosine_sim = np.dot(v1, v2) / denom
        # Shift range from [-1, 1] to [0, 100].
        return (cosine_sim + 1) / 2 * 100
    except Exception:
        return 0
44
# Splits extracted web pages into ~2000-character chunks (100-character
# overlap) before they are sent to the external embedding API.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
    length_function=len,  # chunk size measured in characters
    is_separator_regex=False,
)
app = Flask(__name__)
52
+ @app.route("/status", methods=['GET'])
53
+ def status():
54
+ return "OK"
55
+
56
+ @app.route("/", methods=['GET','POST'])
57
+ def websearch():
58
+ try:
59
+ if request.args.get('q'):
60
+ query = request.args.get('q')
61
+ ifextract = request.args.get('ifextract')
62
+ try:
63
+ start = int(request.args.get('start'))
64
+ except:
65
+ start = 0
66
+ if ifextract == '1':
67
+ return asyncio.run(bing_serach(query,collection,ifextract=True,start=start))
68
+ elif ifextract == '0':
69
+ return asyncio.run(bing_serach(query,collection,ifextract=False,start=start))
70
+ else:
71
+ return '<h1>Invalid Value of ifextract</h1><br>it can Two Value either 0 or 1<br> for 1 it will provide Webpage Extracted'
72
+ else:
73
+ return '<h1>Enter Valid Query</h1> <br> GET parameters<br>1. q(query) = Search query in quote_plus ex: Is+Mango+Sweet<br>1. ifextract(ifextract) = 0,1 for 1 it will provide extracted webpage for suitable websites<br>2. startIndex(start) =Optional Ender the start index of search query'
74
+ except Exception as e:
75
+ return {'type':'error','message':'Unexpected Error',"detail":str(e)}
76
+ @app.route("/adv",methods=["POST","GET"])
77
+ def adv_make():
78
+ global collection
79
+ args = request.get_json()
80
+ if all(key in args for key in ['long_query', 'short_query']):
81
+ short_query = args["short_query"]
82
+ dataz = asyncio.run(bing_serach(short_query, collection, ifextract=True))
83
+ data = dataz['result']
84
+ with open("r.json",'w') as f:
85
+ f.write(json.dumps(data,indent=4))
86
+ toembed = [z['webpage'] for z in data if "embedding_data" not in z and z['webpage'] != "Some Error while Extracting"]
87
+
88
+ # Split these documents into chunks
89
+ toemb = [text_splitter.create_documents([z]) for z in toembed]
90
+
91
+ # Flatten the document chunks
92
+ toembz = [sublist.page_content for z in toemb for sublist in z]
93
+ print("Length of Documents")
94
+ print(len(toembz))
95
+ if(len(toembz) > 0):
96
+ data_to_send = {
97
+ "text":toembz
98
+ }
99
+ embedding = requests.post("https://mangoman7002-flash-embedding.hf.space",json=data_to_send)
100
+ if(embedding.status_code != 200):
101
+ return json.dumps({"type":"error","message":f"error With API {str(embedding.status_code)}"},indent=4)
102
+ embedding = embedding.json()
103
+ else:
104
+ embedding = {'result':[]}
105
+ data_to_send = {
106
+ "text":[args['long_query']]
107
+ }
108
+ query_embedding = requests.post("https://mangoman7002-flash-embedding.hf.space",json=data_to_send)
109
+ if(query_embedding.status_code != 200):
110
+ return json.dumps({"type":"error","message":f"error With API {str(embedding.status_code)}"},indent=4)
111
+ query_embedding = query_embedding.json()
112
+ results = embedding['result']
113
+ current_index=0
114
+ embedding_index = 0
115
+ for index,value in enumerate(dataz['result']):
116
+ if("embedding_data" in dataz['result'][index] and dataz['result'][index]['webpage'] != "Some Error while Extracting"):
117
+ pass
118
+ elif(dataz['result'][index]['webpage'] != "Some Error while Extracting"):
119
+ em_vector = results[embedding_index:embedding_index+len(toemb[current_index])]
120
+ embedding_index+=len(toemb[current_index])
121
+ dataz['result'][index]['embedding_data'] = em_vector
122
+ current_index+=1
123
+ else:
124
+ pass
125
+ final_results = []
126
+ for z in range(len(dataz['result'])):
127
+ thisdata = copy.deepcopy(dataz['result'][z])
128
+ # data['result'][z].pop("embedding")
129
+ collection.update_one({"URL":thisdata['URL']},{"$set":thisdata})
130
+ for z in copy.deepcopy(dataz['result']):
131
+ try:
132
+ for a in copy.deepcopy(z['embedding_data']):
133
+ results.append(a)
134
+ except:
135
+ pass
136
+ results = copy.deepcopy(results)
137
+ for thisr in results:
138
+ thisr['similairy'] = percentage_similarity(thisr['embedding'],query_embedding['result'][0]['embedding'])
139
+ final_results.append(thisr)
140
+ final_results = [z for z in final_results if z['similairy'] > 80]
141
+ final_results = sorted(final_results,key=lambda x:x['similairy'],reverse=True )
142
+ remove_embedding = [z.pop("embedding") for z in final_results]
143
+ dataz['extracts'] = final_results
144
+ return dataz
145
+
146
+ else:
147
+ return(json.dumps({"type":'error','message':"long_query and short_query is not in request"},indent=4))
148
+
149
+ @app.route("/webpage",methods=["POST","GET"])
150
+ def webpage():
151
+ global collection
152
+ args = request.get_json()
153
+ url = args.get("url",None)
154
+ if(url == None):
155
+ return(json.dumps({'type':'error','message':'url is not provided'},indent=4))
156
+ else:
157
+ previous_data = collection.find_one({"URL":url})
158
+ if(previous_data is None):
159
+ result = {}
160
+ result['URL'] = url
161
+ result['time'] = time.time()
162
+ result['webpage'] = asyncio.run(extract_web(result))
163
+ else:
164
+ time_change = time.time() - previous_data['time']
165
+ if(time_change < 86400):
166
+ result = previous_data
167
+ else:
168
+ result = {}
169
+ result['time'] = time.time()
170
+ result['URL'] = url
171
+ result['webpage'] = asyncio.run(extract_web(result))
172
+
173
+
174
+ if("embedding_data" not in result and result['webpage'] != "Some Error while Extracting"):
175
+ toemb = text_splitter.create_documents([result['webpage']])
176
+ toembz = [z.page_content for z in toemb]
177
+ data_to_send = {
178
+ "text":toembz
179
+ }
180
+
181
+ embedding = requests.post("https://mangoman7002-flash-embedding.hf.space",json=data_to_send)
182
+ if(embedding.status_code != 200):
183
+ return json.dumps({"type":"error","message":f"error With API {str(embedding.status_code)}"},indent=4)
184
+ embedding = embedding.json()
185
+ result['embedding_data'] = embedding['result']
186
+ try:
187
+ result.pop("_id")
188
+ except:
189
+ pass
190
+ return(json.dumps(result))
191
if __name__ == '__main__':
    # Local development entry point; in the Docker image the app is served
    # via `flask run` from the container CMD instead.
    app.run(debug=False)
main.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fake_useragent import UserAgent
2
+ import copy
3
+ import time
4
+ import asyncio
5
+ import urllib.parse
6
+ import requests
7
+ import json
8
+ import re
9
+ from bs4 import BeautifulSoup as bs
10
+ import urllib
11
+ from concurrent.futures import ThreadPoolExecutor
12
+ from patterns import patterns
13
+ from urllib.parse import urlparse, parse_qs
14
ua = UserAgent()  # random User-Agent generator used to vary request headers
# NOTE(review): rebinds the imported `patterns` factory name to its result
# (the list of pattern descriptors); the factory is unreachable afterwards.
patterns = patterns()
16
async def bing_serach(query, collection, ifextract=False, start: int = 0):
    """Scrape Google's lightweight HTML results page for *query*.

    NOTE(review): the name keeps the original 'bing_serach' spelling (and it
    actually queries Google, not Bing) because app.py imports it by this name.

    Parameters:
        query: search terms; quote_plus-encoded before the request.
        collection: pymongo collection used as a URL -> page cache.
        ifextract: when True, also download each hit concurrently and attach
            its extracted text (and cached embeddings where available).
        start: accepted but not used in this body — TODO confirm whether
            result paging was intended.

    Returns {'count': 'none', 'result': [...]} where each result holds
    Abstract/Title/URL, plus webpage/time (and possibly embedding_data,
    'from': 'cache') when ifextract is True.
    """
    count = "none"  # never computed; kept for response-shape compatibility
    r = requests.get(f"https://www.google.com/search?q={urllib.parse.quote_plus(query)}")
    r.status_code  # no-op expression: the status is not actually checked
    # print(results)
    soup = bs(r.text,'html.parser')
    main = soup.find_all("div", id='main')[0]
    # Each direct <div> child of #main is one search hit. Class names like
    # kCrYT/DnJfK are Google-internal and may change without notice.
    l = [z for z in main.children if z.name == "div"]
    results = []
    for z in l:
        kCrYT = z.find_all("div",class_="kCrYT")
        if(len(kCrYT) > 0):
            try:
                # Layout variant 1: title inside a DnJfK wrapper, abstract in
                # the second kCrYT div.
                if(len(kCrYT[0].find_all("a")[0].find_all(class_="DnJfK"))>0):
                    abstract = kCrYT[1].text
                    title = kCrYT[0].find_all("a")[0].find_all(class_="DnJfK")[0].find_all("h3")[0].text
                    url = kCrYT[0].find_all("a")[0].get('href')
                    results.append({
                        'Abstract':abstract,
                        'Title':title,
                        # href is a Google redirect (/url?q=...): pull the real
                        # target from its 'q' query parameter.
                        'URL':parse_qs(urlparse(url).query).get('q',[""])[0]
                    })
            except Exception as e:
                try:
                    # Layout variant 2: title in spans, abstract in first div.
                    if(len(kCrYT[1].find_all("a")[0].find_all("span")) > 1):
                        abstract = kCrYT[0].text
                        title = kCrYT[1].find_all("a")[0].find_all("span")[0].text
                        url = kCrYT[1].find_all("a")[0].get('href')
                        results.append({
                            'Abstract':abstract,
                            'Title':title,
                            'URL':parse_qs(urlparse(url).query).get('q',[""])[0]
                        })
                    else:
                        print("Method 2 Failed")
                except:
                    print(e)
    if ifextract:
        # Phase 1: schedule downloads for uncached URLs; fill cached ones.
        for i, result in enumerate(results):
            previous_data = collection.find_one({"URL":result['URL']})
            if(previous_data is None):
                # Schedule the fetch; awaited in phase 2 so downloads overlap.
                result['webpage'] = asyncio.create_task(extract_web(result))
                result['time'] = time.time()

            else:
                print(f"This is Taken from cache {result['URL']}\n\n")
                result['webpage'] = previous_data['webpage']
                try:
                    result['embedding_data'] = previous_data['embedding_data']
                except:
                    print(f"embedding_data not exist in {result['URL']}")
                result['from'] = "cache"
        # Phase 2: await the scheduled downloads. For cache hits 'webpage'
        # is already a string, so the await raises and is ignored.
        for result in results:
            try:
                result['webpage'] = await result['webpage']
            except:
                pass
            # NOTE(review): this inserts every result — including cache hits —
            # so cached URLs accumulate duplicate documents; confirm intended.
            dummy_result = copy.deepcopy(result) # Creates a completely independent copy
            collection.insert_one(dummy_result)
        # print({'count':count,'result':results})
        return {'count':count,'result':results}
    else:
        return {'count':count,'result':results}
79
+
80
+
81
+
82
+
83
async def extract_web(result):
    """Download result['URL'] and run the first matching extraction pattern.

    On success, stores the extracted text and a timestamp on *result* and
    returns the text. On any failure returns the sentinel string
    'Some Error while Extracting' that callers test against.

    Fixes over the original: a failed request previously left `content`
    unbound (NameError swallowed by the outer except), and a URL matching no
    pattern fell through returning None — both now return the sentinel
    explicitly.
    """
    try:
        headers = {
            'User-Agent': ua.random
        }
        try:
            # NOTE(review): verify=False disables TLS certificate checking —
            # confirm this is intentional for these scrape targets.
            content = requests.get(result['URL'], headers=headers, verify=False)
        except Exception as e:
            print("some Error While Initial Request")
            print(str(e))
            return 'Some Error while Extracting'
        print(content.status_code)
        for pattern in patterns:
            if re.match(pattern['recode'], result['URL']):
                thisr = pattern['function'](content)
                result['webpage'] = thisr
                result['time'] = time.time()
                return thisr
        # No pattern matched this URL (e.g. non-https): report failure.
        return 'Some Error while Extracting'
    except Exception as e:
        print(str(e))
        return 'Some Error while Extracting'
+
pattern_functions.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup as bs
2
+ import re
3
def extract_1(content):
    """Strip boilerplate from an HTML response and return readable body text.

    Parameters:
        content: a requests.Response-like object exposing ``.text`` (raw HTML).

    Returns the <body> text with non-content tags removed, whitespace runs
    collapsed, keeping only lines of more than 3 whitespace-separated words.
    """
    finalcontent = ''
    # Tags whose contents are never useful prose.
    toremove = ['link','script','style','iframe','object','noscript','param','embed','meta','base','canvas','svg']
    content_soup = bs(content.text, 'html.parser')
    for soup_body in content_soup.find_all('body'):
        for remove_tag in toremove:
            for trash_tag in soup_body.find_all(remove_tag):
                trash_tag.decompose()
        thisbody = soup_body.get_text()
        thisbody = thisbody.replace("\t",'')
        # Drop lines holding a single word-character (stray list bullets etc.).
        thisbody = re.sub(r"\n\w\n",'\n',thisbody)
        # Collapse runs of spaces / newlines in a single pass. Equivalent to
        # the original fixpoint replace('  ',' ') / replace('\n\n','\n')
        # loops, but O(n) instead of repeated full-string scans.
        thisbody = re.sub(r" {2,}", " ", thisbody)
        thisbody = re.sub(r"\n{2,}", "\n", thisbody)

        finalcontent = finalcontent + thisbody
    print('content Extracted')
    return "\n".join([z.strip() for z in finalcontent.split("\n") if len(z.strip().split(" ")) > 3])
patterns.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from pattern_functions import *
2
def patterns():
    """Return the ordered list of URL-extraction pattern descriptors.

    Each descriptor maps a URL regex ('recode') to the extractor callable
    ('function') that turns a fetched response into page text.
    """
    catch_all_https = {
        'Title':'All Extractor',
        'id':'extract_1',
        'recode':'^https://.*$',
        'function':extract_1
    }
    return [catch_all_https]
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ urllib3
2
+ beautifulsoup4
3
+ requests
4
+ flask
5
+ protobuf
6
+ langchain_text_splitters
7
+ numpy
8
+ pymongo
9
+ fake_useragent