Spaces:
Sleeping
Sleeping
Commit ·
11dec1a
1
Parent(s): 2f84696
scraping code changed
Browse files- __pycache__/advance_post.cpython-310.pyc +0 -0
- __pycache__/paraphrase_post.cpython-310.pyc +0 -0
- __pycache__/scrap_post.cpython-310.pyc +0 -0
- advance_post.py +3 -3
- app.py +1 -1
- paraphrase_post.py +1 -1
- scrap_post.py +10 -30
__pycache__/advance_post.cpython-310.pyc
ADDED
|
Binary file (3.46 kB). View file
|
|
|
__pycache__/paraphrase_post.cpython-310.pyc
ADDED
|
Binary file (2.96 kB). View file
|
|
|
__pycache__/scrap_post.cpython-310.pyc
ADDED
|
Binary file (717 Bytes). View file
|
|
|
advance_post.py
CHANGED
|
@@ -10,7 +10,7 @@ import nest_asyncio
|
|
| 10 |
def google_search(linkedin_post,model , google_api_key, search_engine_id , num_results_per_query=[3,2,1]):
|
| 11 |
|
| 12 |
response_schemas = [
|
| 13 |
-
ResponseSchema(name="
|
| 14 |
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
|
| 15 |
format_instructions = output_parser.get_format_instructions()
|
| 16 |
|
|
@@ -29,7 +29,7 @@ def google_search(linkedin_post,model , google_api_key, search_engine_id , num_r
|
|
| 29 |
|
| 30 |
chain = prompt | model | output_parser
|
| 31 |
result=chain.invoke({"post": linkedin_post})
|
| 32 |
-
questions=result['
|
| 33 |
# print(questions)
|
| 34 |
|
| 35 |
all_links = []
|
|
@@ -61,7 +61,7 @@ def google_search(linkedin_post,model , google_api_key, search_engine_id , num_r
|
|
| 61 |
# result=chain.invoke({'post':linkedinpost , 'content':docs})
|
| 62 |
# return result , docs
|
| 63 |
|
| 64 |
-
|
| 65 |
def advanced_post(all_links ,model ,linkedinpost):
|
| 66 |
loader = WebBaseLoader(all_links,encoding="utf-8")
|
| 67 |
loader.requests_per_second = 1
|
|
|
|
| 10 |
def google_search(linkedin_post,model , google_api_key, search_engine_id , num_results_per_query=[3,2,1]):
|
| 11 |
|
| 12 |
response_schemas = [
|
| 13 |
+
ResponseSchema(name="questions", description="These are the top three relevant questions from the LinkedIn post" , type="list")]
|
| 14 |
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
|
| 15 |
format_instructions = output_parser.get_format_instructions()
|
| 16 |
|
|
|
|
| 29 |
|
| 30 |
chain = prompt | model | output_parser
|
| 31 |
result=chain.invoke({"post": linkedin_post})
|
| 32 |
+
questions=result['questions']
|
| 33 |
# print(questions)
|
| 34 |
|
| 35 |
all_links = []
|
|
|
|
| 61 |
# result=chain.invoke({'post':linkedinpost , 'content':docs})
|
| 62 |
# return result , docs
|
| 63 |
|
| 64 |
+
nest_asyncio.apply()
|
| 65 |
def advanced_post(all_links ,model ,linkedinpost):
|
| 66 |
loader = WebBaseLoader(all_links,encoding="utf-8")
|
| 67 |
loader.requests_per_second = 1
|
app.py
CHANGED
|
@@ -3,7 +3,7 @@ import re
|
|
| 3 |
import openai
|
| 4 |
from paraphrase_post import get_original_url , paraphrased_post
|
| 5 |
from advance_post import google_search , advanced_post
|
| 6 |
-
from
|
| 7 |
from langchain_groq import ChatGroq
|
| 8 |
#from langchain import HuggingFaceHub
|
| 9 |
|
|
|
|
| 3 |
import openai
|
| 4 |
from paraphrase_post import get_original_url , paraphrased_post
|
| 5 |
from advance_post import google_search , advanced_post
|
| 6 |
+
from langchain_community.chat_models import ChatOpenAI
|
| 7 |
from langchain_groq import ChatGroq
|
| 8 |
#from langchain import HuggingFaceHub
|
| 9 |
|
paraphrase_post.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
from langchain_community.document_loaders import WebBaseLoader
|
| 2 |
from langchain.prompts import ChatPromptTemplate
|
| 3 |
from langchain.output_parsers import ResponseSchema
|
| 4 |
from langchain.output_parsers import StructuredOutputParser
|
|
@@ -45,6 +44,7 @@ def get_original_url(url):
|
|
| 45 |
def paraphrased_post(url,model):
|
| 46 |
|
| 47 |
post=scrappost(url)
|
|
|
|
| 48 |
|
| 49 |
template="""You are a helpful paraphraser tool. You are provided with a content and your task is to paraphrase it.
|
| 50 |
{data}"""
|
|
|
|
|
|
|
| 1 |
from langchain.prompts import ChatPromptTemplate
|
| 2 |
from langchain.output_parsers import ResponseSchema
|
| 3 |
from langchain.output_parsers import StructuredOutputParser
|
|
|
|
| 44 |
def paraphrased_post(url,model):
|
| 45 |
|
| 46 |
post=scrappost(url)
|
| 47 |
+
print(post)
|
| 48 |
|
| 49 |
template="""You are a helpful paraphraser tool. You are provided with a content and your task is to paraphrase it.
|
| 50 |
{data}"""
|
scrap_post.py
CHANGED
|
@@ -1,33 +1,13 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
|
| 5 |
|
| 6 |
def scrappost(url):
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
for element in glossary_page:
|
| 15 |
-
if 'unstructured.documents.html.HTMLTitle' in str(type(element)):
|
| 16 |
-
# If there's already content in the group, add it to all_groups
|
| 17 |
-
if group['page_content']:
|
| 18 |
-
all_groups.append(group)
|
| 19 |
-
group = {'page_content': ''}
|
| 20 |
-
group['page_content'] += element.text
|
| 21 |
-
if 'unstructured.documents.html.HTMLNarrativeText' in str(type(element)):
|
| 22 |
-
group['page_content'] += element.text
|
| 23 |
-
|
| 24 |
-
if "unstructured.documents.html.HTMLListItem" in str(type(element)):
|
| 25 |
-
group['page_content']+=element.text
|
| 26 |
-
|
| 27 |
-
# # Add the last group if it exists
|
| 28 |
-
if group['page_content']:
|
| 29 |
-
all_groups.append(group)
|
| 30 |
-
|
| 31 |
-
# Print the groups
|
| 32 |
-
for group in all_groups[:1]:
|
| 33 |
-
return group["page_content"]
|
|
|
|
import requests
import json
from bs4 import BeautifulSoup


def scrappost(url):
    """Scrape the article body of a post from the given URL.

    Fetches the page, then looks through every ``<script type="application/ld+json">``
    block for a JSON object containing an ``articleBody`` field (the schema.org
    structured-data field carrying the post text).

    Parameters
    ----------
    url : str
        Address of the post page to scrape.

    Returns
    -------
    str | None
        The article body of the first ld+json block that provides one,
        or ``None`` when no block contains an ``articleBody`` field.

    Raises
    ------
    requests.RequestException
        If the HTTP request fails or times out.
    """
    # Timeout so a hung server cannot block the caller indefinitely.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    # schema.org structured data is embedded in ld+json script tags.
    script_tags = soup.find_all('script', type="application/ld+json")
    for tag in script_tags:
        # A page may carry several ld+json blocks; skip malformed ones and
        # blocks without an articleBody instead of crashing on the first.
        try:
            payload = json.loads(tag.get_text())
        except (json.JSONDecodeError, ValueError):
            continue
        if isinstance(payload, dict) and 'articleBody' in payload:
            return payload['articleBody']
    # Explicitly signal "nothing found" rather than falling through.
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|