Spaces:
Paused
Paused
Upload 6 files
Browse files
- Dockerfile +66 -0
- app.py +192 -0
- main.py +104 -0
- pattern_functions.py +28 -0
- patterns.py +9 -0
- requirements.txt +9 -0
Dockerfile
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10

WORKDIR /home/

COPY ./requirements.txt /home/requirements.txt

# Non-interactive frontend for build-time apt only (ARG is not persisted into
# the final image; the later ENV restores the runtime default).
ARG DEBIAN_FRONTEND=noninteractive

RUN apt update && \
    apt install -y --no-install-recommends \
    curl \
    git \
    git-lfs \
    libatomic1 \
    locales \
    man \
    nano \
    net-tools \
    openssh-client \
    python3 \
    python3-pip \
    python3-venv \
    sudo \
    vim \
    wget \
    zsh \
    zip \
    unzip \
    ffmpeg \
    imagemagick \
    && git lfs install \
    && rm -rf /var/lib/apt/lists/*
ENV USERNAME=user \
    USER_UID=1000 \
    USER_GID=1000 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=all \
    EDITOR=code \
    VISUAL=code \
    GIT_EDITOR="code --wait" \
    OPENVSCODE_SERVER_ROOT=/home/.vscode \
    OPENVSCODE=/home/.vscode/bin/openvscode-server
ENV DEBIAN_FRONTEND=dialog
# MongoDB 8.0 (focal build) needs libssl1.1, which this base image no longer
# ships — install both .debs directly.  `sudo` removed: build steps already
# run as root.  Downloaded packages are deleted to keep the layer small.
RUN wget http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
    dpkg -i libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
    rm libssl1.1_1.1.1f-1ubuntu2_amd64.deb
RUN wget https://repo.mongodb.org/apt/ubuntu/dists/focal/mongodb-org/8.0/multiverse/binary-amd64/mongodb-org-server_8.0.0_amd64.deb && \
    apt install -y ./mongodb-org-server_8.0.0_amd64.deb && \
    rm mongodb-org-server_8.0.0_amd64.deb

RUN pip install --no-cache-dir --upgrade -r /home/requirements.txt

COPY . .
WORKDIR /home/user/

# Creating the user and usergroup
RUN groupadd --gid ${USER_GID} ${USERNAME} \
    && useradd --uid ${USER_UID} --gid ${USERNAME} -m -s /bin/bash ${USERNAME} \
    && echo ${USERNAME} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USERNAME} \
    && chmod 0440 /etc/sudoers.d/${USERNAME}

RUN chmod g+rw /home && \
    chown -R ${USERNAME}:${USERNAME} /home/${USERNAME}
# Mongo data directory, writable by the unprivileged runtime user.
RUN mkdir -p /db && chown -R ${USERNAME}:${USERNAME} /db
USER $USERNAME
# Copy the app into the runtime workdir so `flask run` finds app.py here.
COPY . .
# Start MongoDB (forked, logging to mongod.log) then the Flask app on the
# Spaces port.  NOTE(review): `Malloc=system` just sets an env var named
# `Malloc` for mongod — confirm the intended variable name and effect.
CMD ["sh", "-c", "Malloc=system mongod --dbpath /db --logpath mongod.log --fork && flask run --host=0.0.0.0 --port=7860"]
|
app.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import copy
|
| 3 |
+
import os
|
| 4 |
+
from flask import Flask, render_template, request
|
| 5 |
+
import json
|
| 6 |
+
from main import bing_serach, extract_web
|
| 7 |
+
import asyncio
|
| 8 |
+
import requests
|
| 9 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 10 |
+
import numpy as np
|
| 11 |
+
from pymongo import MongoClient
|
| 12 |
+
# MongoDB connection used as a cache for fetched/extracted web pages.
client = MongoClient('mongodb://localhost:27017/')  # Replace with your MongoDB URI

# Database holding the cached web data.
db = client['webdata']  # Replace 'webdata' with your database name

# One document per URL: {URL, time, webpage, [embedding_data], ...}.
collection = db['data']
|
| 18 |
+
|
| 19 |
+
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric vectors."""
    numerator = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return numerator / denominator
|
| 31 |
+
|
| 32 |
+
def percentage_similarity(vec1, vec2):
    """Map the cosine similarity of two vectors onto a 0-100 percentage.

    Returns 0 when the similarity cannot be computed (e.g. mismatched
    shapes or non-numeric input) — callers treat 0 as "no match".
    """
    try:
        cosine_sim = cosine_similarity(vec1, vec2)
        # Shift range from [-1, 1] to [0, 100].
        return (cosine_sim + 1) / 2 * 100
    except Exception:
        # Narrowed from a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt.  Still best-effort: score 0 on any failure.
        return 0
|
| 44 |
+
# Chunker used to split extracted webpages before sending them to the
# embedding API: ~2000-character chunks with 100 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)
# Flask application serving the search / embedding endpoints below.
app = Flask(__name__)
|
| 52 |
+
@app.route("/status", methods=['GET'])
|
| 53 |
+
def status():
|
| 54 |
+
return "OK"
|
| 55 |
+
|
| 56 |
+
@app.route("/", methods=['GET','POST'])
|
| 57 |
+
def websearch():
|
| 58 |
+
try:
|
| 59 |
+
if request.args.get('q'):
|
| 60 |
+
query = request.args.get('q')
|
| 61 |
+
ifextract = request.args.get('ifextract')
|
| 62 |
+
try:
|
| 63 |
+
start = int(request.args.get('start'))
|
| 64 |
+
except:
|
| 65 |
+
start = 0
|
| 66 |
+
if ifextract == '1':
|
| 67 |
+
return asyncio.run(bing_serach(query,collection,ifextract=True,start=start))
|
| 68 |
+
elif ifextract == '0':
|
| 69 |
+
return asyncio.run(bing_serach(query,collection,ifextract=False,start=start))
|
| 70 |
+
else:
|
| 71 |
+
return '<h1>Invalid Value of ifextract</h1><br>it can Two Value either 0 or 1<br> for 1 it will provide Webpage Extracted'
|
| 72 |
+
else:
|
| 73 |
+
return '<h1>Enter Valid Query</h1> <br> GET parameters<br>1. q(query) = Search query in quote_plus ex: Is+Mango+Sweet<br>1. ifextract(ifextract) = 0,1 for 1 it will provide extracted webpage for suitable websites<br>2. startIndex(start) =Optional Ender the start index of search query'
|
| 74 |
+
except Exception as e:
|
| 75 |
+
return {'type':'error','message':'Unexpected Error',"detail":str(e)}
|
| 76 |
+
@app.route("/adv",methods=["POST","GET"])
|
| 77 |
+
def adv_make():
|
| 78 |
+
global collection
|
| 79 |
+
args = request.get_json()
|
| 80 |
+
if all(key in args for key in ['long_query', 'short_query']):
|
| 81 |
+
short_query = args["short_query"]
|
| 82 |
+
dataz = asyncio.run(bing_serach(short_query, collection, ifextract=True))
|
| 83 |
+
data = dataz['result']
|
| 84 |
+
with open("r.json",'w') as f:
|
| 85 |
+
f.write(json.dumps(data,indent=4))
|
| 86 |
+
toembed = [z['webpage'] for z in data if "embedding_data" not in z and z['webpage'] != "Some Error while Extracting"]
|
| 87 |
+
|
| 88 |
+
# Split these documents into chunks
|
| 89 |
+
toemb = [text_splitter.create_documents([z]) for z in toembed]
|
| 90 |
+
|
| 91 |
+
# Flatten the document chunks
|
| 92 |
+
toembz = [sublist.page_content for z in toemb for sublist in z]
|
| 93 |
+
print("Length of Documents")
|
| 94 |
+
print(len(toembz))
|
| 95 |
+
if(len(toembz) > 0):
|
| 96 |
+
data_to_send = {
|
| 97 |
+
"text":toembz
|
| 98 |
+
}
|
| 99 |
+
embedding = requests.post("https://mangoman7002-flash-embedding.hf.space",json=data_to_send)
|
| 100 |
+
if(embedding.status_code != 200):
|
| 101 |
+
return json.dumps({"type":"error","message":f"error With API {str(embedding.status_code)}"},indent=4)
|
| 102 |
+
embedding = embedding.json()
|
| 103 |
+
else:
|
| 104 |
+
embedding = {'result':[]}
|
| 105 |
+
data_to_send = {
|
| 106 |
+
"text":[args['long_query']]
|
| 107 |
+
}
|
| 108 |
+
query_embedding = requests.post("https://mangoman7002-flash-embedding.hf.space",json=data_to_send)
|
| 109 |
+
if(query_embedding.status_code != 200):
|
| 110 |
+
return json.dumps({"type":"error","message":f"error With API {str(embedding.status_code)}"},indent=4)
|
| 111 |
+
query_embedding = query_embedding.json()
|
| 112 |
+
results = embedding['result']
|
| 113 |
+
current_index=0
|
| 114 |
+
embedding_index = 0
|
| 115 |
+
for index,value in enumerate(dataz['result']):
|
| 116 |
+
if("embedding_data" in dataz['result'][index] and dataz['result'][index]['webpage'] != "Some Error while Extracting"):
|
| 117 |
+
pass
|
| 118 |
+
elif(dataz['result'][index]['webpage'] != "Some Error while Extracting"):
|
| 119 |
+
em_vector = results[embedding_index:embedding_index+len(toemb[current_index])]
|
| 120 |
+
embedding_index+=len(toemb[current_index])
|
| 121 |
+
dataz['result'][index]['embedding_data'] = em_vector
|
| 122 |
+
current_index+=1
|
| 123 |
+
else:
|
| 124 |
+
pass
|
| 125 |
+
final_results = []
|
| 126 |
+
for z in range(len(dataz['result'])):
|
| 127 |
+
thisdata = copy.deepcopy(dataz['result'][z])
|
| 128 |
+
# data['result'][z].pop("embedding")
|
| 129 |
+
collection.update_one({"URL":thisdata['URL']},{"$set":thisdata})
|
| 130 |
+
for z in copy.deepcopy(dataz['result']):
|
| 131 |
+
try:
|
| 132 |
+
for a in copy.deepcopy(z['embedding_data']):
|
| 133 |
+
results.append(a)
|
| 134 |
+
except:
|
| 135 |
+
pass
|
| 136 |
+
results = copy.deepcopy(results)
|
| 137 |
+
for thisr in results:
|
| 138 |
+
thisr['similairy'] = percentage_similarity(thisr['embedding'],query_embedding['result'][0]['embedding'])
|
| 139 |
+
final_results.append(thisr)
|
| 140 |
+
final_results = [z for z in final_results if z['similairy'] > 80]
|
| 141 |
+
final_results = sorted(final_results,key=lambda x:x['similairy'],reverse=True )
|
| 142 |
+
remove_embedding = [z.pop("embedding") for z in final_results]
|
| 143 |
+
dataz['extracts'] = final_results
|
| 144 |
+
return dataz
|
| 145 |
+
|
| 146 |
+
else:
|
| 147 |
+
return(json.dumps({"type":'error','message':"long_query and short_query is not in request"},indent=4))
|
| 148 |
+
|
| 149 |
+
@app.route("/webpage",methods=["POST","GET"])
|
| 150 |
+
def webpage():
|
| 151 |
+
global collection
|
| 152 |
+
args = request.get_json()
|
| 153 |
+
url = args.get("url",None)
|
| 154 |
+
if(url == None):
|
| 155 |
+
return(json.dumps({'type':'error','message':'url is not provided'},indent=4))
|
| 156 |
+
else:
|
| 157 |
+
previous_data = collection.find_one({"URL":url})
|
| 158 |
+
if(previous_data is None):
|
| 159 |
+
result = {}
|
| 160 |
+
result['URL'] = url
|
| 161 |
+
result['time'] = time.time()
|
| 162 |
+
result['webpage'] = asyncio.run(extract_web(result))
|
| 163 |
+
else:
|
| 164 |
+
time_change = time.time() - previous_data['time']
|
| 165 |
+
if(time_change < 86400):
|
| 166 |
+
result = previous_data
|
| 167 |
+
else:
|
| 168 |
+
result = {}
|
| 169 |
+
result['time'] = time.time()
|
| 170 |
+
result['URL'] = url
|
| 171 |
+
result['webpage'] = asyncio.run(extract_web(result))
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
if("embedding_data" not in result and result['webpage'] != "Some Error while Extracting"):
|
| 175 |
+
toemb = text_splitter.create_documents([result['webpage']])
|
| 176 |
+
toembz = [z.page_content for z in toemb]
|
| 177 |
+
data_to_send = {
|
| 178 |
+
"text":toembz
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
embedding = requests.post("https://mangoman7002-flash-embedding.hf.space",json=data_to_send)
|
| 182 |
+
if(embedding.status_code != 200):
|
| 183 |
+
return json.dumps({"type":"error","message":f"error With API {str(embedding.status_code)}"},indent=4)
|
| 184 |
+
embedding = embedding.json()
|
| 185 |
+
result['embedding_data'] = embedding['result']
|
| 186 |
+
try:
|
| 187 |
+
result.pop("_id")
|
| 188 |
+
except:
|
| 189 |
+
pass
|
| 190 |
+
return(json.dumps(result))
|
| 191 |
+
# Entry point when run directly; the Docker image launches via `flask run`.
if __name__ == '__main__':
    app.run(debug=False)
|
main.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fake_useragent import UserAgent
|
| 2 |
+
import copy
|
| 3 |
+
import time
|
| 4 |
+
import asyncio
|
| 5 |
+
import urllib.parse
|
| 6 |
+
import requests
|
| 7 |
+
import json
|
| 8 |
+
import re
|
| 9 |
+
from bs4 import BeautifulSoup as bs
|
| 10 |
+
import urllib
|
| 11 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 12 |
+
from patterns import patterns
|
| 13 |
+
from urllib.parse import urlparse, parse_qs
|
| 14 |
+
# Rotating User-Agent generator shared by outbound page requests.
ua = UserAgent()
# NOTE(review): rebinds the imported `patterns` function name to the pattern
# list it returns — the function is no longer reachable after this line.
patterns = patterns()
|
| 16 |
+
async def bing_serach(query, collection, ifextract=False,start:int=0):
    """Scrape Google search results for `query`, optionally extracting each hit's page.

    Args:
        query: search terms (quote_plus-encoded into the request URL).
        collection: pymongo collection used as a page cache keyed on URL.
        ifextract: when True, fetch and extract every result's webpage.
        start: start index of the results.  NOTE(review): not used anywhere
            in the body — confirm whether paging was ever implemented.

    Returns:
        {'count': 'none', 'result': [{'Abstract', 'Title', 'URL', ...}]};
        with ifextract, each result also gains 'webpage'/'time' (or cached fields).
    """
    count = "none"
    r = requests.get(f"https://www.google.com/search?q={urllib.parse.quote_plus(query)}")
    r.status_code  # no-op expression; presumably left over from debugging
    # print(results)
    soup = bs(r.text,'html.parser')
    main = soup.find_all("div", id='main')[0]
    # Top-level result containers are the direct div children of #main.
    l = [z for z in main.children if z.name == "div"]
    results = []
    for z in l:
        kCrYT = z.find_all("div",class_="kCrYT")
        if(len(kCrYT) > 0):
            try:
                # Layout variant 1: title inside a "DnJfK"-classed anchor child.
                if(len(kCrYT[0].find_all("a")[0].find_all(class_="DnJfK"))>0):
                    abstract = kCrYT[1].text
                    title = kCrYT[0].find_all("a")[0].find_all(class_="DnJfK")[0].find_all("h3")[0].text
                    url = kCrYT[0].find_all("a")[0].get('href')
                    results.append({
                        'Abstract':abstract,
                        'Title':title,
                        # Google wraps the target in a redirect URL; the real
                        # destination is in its 'q' query parameter.
                        'URL':parse_qs(urlparse(url).query).get('q',[""])[0]
                    })
            except Exception as e:
                try:
                    # Layout variant 2: title in the second block's anchor spans.
                    if(len(kCrYT[1].find_all("a")[0].find_all("span")) > 1):
                        abstract = kCrYT[0].text
                        title = kCrYT[1].find_all("a")[0].find_all("span")[0].text
                        url = kCrYT[1].find_all("a")[0].get('href')
                        results.append({
                            'Abstract':abstract,
                            'Title':title,
                            'URL':parse_qs(urlparse(url).query).get('q',[""])[0]
                        })
                    else:
                        print("Method 2 Failed")
                except:
                    print(e)
    if ifextract:
        # Schedule concurrent page extractions, reusing cached copies.
        for i, result in enumerate(results):
            previous_data = collection.find_one({"URL":result['URL']})
            if(previous_data is None):
                # Not cached: start an async fetch; awaited in the loop below.
                result['webpage'] = asyncio.create_task(extract_web(result))
                result['time'] = time.time()

            else:
                print(f"This is Taken from cache {result['URL']}\n\n")
                result['webpage'] = previous_data['webpage']
                try:
                    result['embedding_data'] = previous_data['embedding_data']
                except:
                    print(f"embedding_data not exist in {result['URL']}")
                result['from'] = "cache"
        for result in results:
            try:
                # Resolve the task into the extracted text (no-op for cache hits).
                result['webpage'] =await result['webpage']
            except:
                pass
            # NOTE(review): this inserts every result — including cache hits —
            # so repeated queries create duplicate documents; confirm whether
            # an upsert keyed on URL was intended.
            dummy_result = copy.deepcopy(result) # Creates a completely independent copy
            collection.insert_one(dummy_result)
        # print({'count':count,'result':results})
        return {'count':count,'result':results}
    else:
        return {'count':count,'result':results}
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
async def extract_web(result):
    """Fetch result['URL'] and return its extracted text via the first
    matching pattern from `patterns`.

    Side effects: on success, sets result['webpage'] and result['time'].
    Returns the extracted text, or the sentinel string
    'Some Error while Extracting' on any failure.
    """
    try:
        headers = {
            'User-Agent': ua.random
        }
        try:
            # NOTE(review): verify=False disables TLS certificate checks —
            # confirm this is intentional.
            content = requests.get(result['URL'],headers=headers,verify=False)
        except Exception as request_error:
            # BUG FIX: the original printed a message and fell through with
            # `content` unbound, crashing with NameError on the next line.
            print("some Error While Initial Request")
            print(str(request_error))
            return 'Some Error while Extracting'
        print(content.status_code)
        for pattern in patterns:
            if re.match(pattern['recode'],result['URL']):
                thisr = pattern['function'](content)
                result['webpage'] = thisr
                result['time'] = time.time()
                return thisr
        # BUG FIX: no pattern matched (e.g. a plain-http URL) — the original
        # implicitly returned None, which callers then passed to the text
        # splitter; return the sentinel the callers already check for.
        return 'Some Error while Extracting'
    except Exception as e:
        print(str(e))
        return 'Some Error while Extracting'
        # return ('There is some error with This Pattern\n','Pattern Name',pattern['Title'],'\nPattern Id',pattern['id'])
|
| 103 |
+
|
| 104 |
+
|
pattern_functions.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup as bs
|
| 2 |
+
import re
|
| 3 |
+
def extract_1(content):
    """Extract readable text from an HTTP response.

    Strips non-content tags from every <body>, collapses whitespace, and
    keeps only lines with more than three words.

    Args:
        content: a response object with a `.text` attribute holding HTML.
    Returns:
        The cleaned text, lines joined by newlines.
    """
    finalcontent = ''
    # Tags that never carry readable text.
    toremove = ['link','script','style','iframe','object','noscript','param','embed','meta','base','canvas','svg']
    content_soup = bs(content.text, 'html.parser')
    for soup_body in content_soup.find_all('body'):
        for remove_tag in toremove:
            for trash_tag in soup_body.find_all(remove_tag):
                trash_tag.decompose()
        thisbody = soup_body.get_text()
        thisbody = thisbody.replace("\t",'')
        # Drop lines consisting of a single word character.
        thisbody = re.sub(r"\n\w\n",'\n',thisbody)
        # Collapse runs of spaces / newlines in one pass each — equivalent to
        # the original repeated-replace fixpoint loops, without re-scanning
        # the whole string on every iteration.
        thisbody = re.sub(r" {2,}", " ", thisbody)
        thisbody = re.sub(r"\n{2,}", "\n", thisbody)
        finalcontent = finalcontent + thisbody
    print('content Extracted')
    # Keep only lines with more than three space-separated words.
    return "\n".join([z.strip() for z in finalcontent.split("\n") if len(z.strip().split(" ")) > 3])
|
patterns.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pattern_functions import *
|
| 2 |
+
def patterns():
    """Build the registry of URL-pattern extractors.

    Each entry maps a URL regex ('recode') to the extractor callable
    ('function') that turns a response into readable text.
    """
    return [
        {
            'Title': 'All Extractor',
            'id': 'extract_1',
            'recode': '^https://.*$',
            'function': extract_1,
        },
    ]
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
urllib3
|
| 2 |
+
bs4
|
| 3 |
+
requests
|
| 4 |
+
flask
|
| 5 |
+
protobuf
|
| 6 |
+
langchain_text_splitters
|
| 7 |
+
numpy
|
| 8 |
+
pymongo
|
| 9 |
+
fake_useragent
|