Spaces:
Sleeping
Sleeping
Commit ·
11dec1a
1
Parent(s): 2f84696
scraping code changed
Browse files- __pycache__/advance_post.cpython-310.pyc +0 -0
- __pycache__/paraphrase_post.cpython-310.pyc +0 -0
- __pycache__/scrap_post.cpython-310.pyc +0 -0
- advance_post.py +3 -3
- app.py +1 -1
- paraphrase_post.py +1 -1
- scrap_post.py +10 -30
__pycache__/advance_post.cpython-310.pyc
ADDED
|
Binary file (3.46 kB). View file
|
|
|
__pycache__/paraphrase_post.cpython-310.pyc
ADDED
|
Binary file (2.96 kB). View file
|
|
|
__pycache__/scrap_post.cpython-310.pyc
ADDED
|
Binary file (717 Bytes). View file
|
|
|
advance_post.py
CHANGED
|
@@ -10,7 +10,7 @@ import nest_asyncio
|
|
| 10 |
def google_search(linkedin_post,model , google_api_key, search_engine_id , num_results_per_query=[3,2,1]):
|
| 11 |
|
| 12 |
response_schemas = [
|
| 13 |
-
ResponseSchema(name="
|
| 14 |
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
|
| 15 |
format_instructions = output_parser.get_format_instructions()
|
| 16 |
|
|
@@ -29,7 +29,7 @@ def google_search(linkedin_post,model , google_api_key, search_engine_id , num_r
|
|
| 29 |
|
| 30 |
chain = prompt | model | output_parser
|
| 31 |
result=chain.invoke({"post": linkedin_post})
|
| 32 |
-
questions=result['
|
| 33 |
# print(questions)
|
| 34 |
|
| 35 |
all_links = []
|
|
@@ -61,7 +61,7 @@ def google_search(linkedin_post,model , google_api_key, search_engine_id , num_r
|
|
| 61 |
# result=chain.invoke({'post':linkedinpost , 'content':docs})
|
| 62 |
# return result , docs
|
| 63 |
|
| 64 |
-
|
| 65 |
def advanced_post(all_links ,model ,linkedinpost):
|
| 66 |
loader = WebBaseLoader(all_links,encoding="utf-8")
|
| 67 |
loader.requests_per_second = 1
|
|
|
|
| 10 |
def google_search(linkedin_post,model , google_api_key, search_engine_id , num_results_per_query=[3,2,1]):
|
| 11 |
|
| 12 |
response_schemas = [
|
| 13 |
+
ResponseSchema(name="questions", description="These are the top three relevant questions from the LinkedIn post" , type="list")]
|
| 14 |
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
|
| 15 |
format_instructions = output_parser.get_format_instructions()
|
| 16 |
|
|
|
|
| 29 |
|
| 30 |
chain = prompt | model | output_parser
|
| 31 |
result=chain.invoke({"post": linkedin_post})
|
| 32 |
+
questions=result['questions']
|
| 33 |
# print(questions)
|
| 34 |
|
| 35 |
all_links = []
|
|
|
|
| 61 |
# result=chain.invoke({'post':linkedinpost , 'content':docs})
|
| 62 |
# return result , docs
|
| 63 |
|
| 64 |
+
nest_asyncio.apply()
|
| 65 |
def advanced_post(all_links ,model ,linkedinpost):
|
| 66 |
loader = WebBaseLoader(all_links,encoding="utf-8")
|
| 67 |
loader.requests_per_second = 1
|
app.py
CHANGED
|
@@ -3,7 +3,7 @@ import re
|
|
| 3 |
import openai
|
| 4 |
from paraphrase_post import get_original_url , paraphrased_post
|
| 5 |
from advance_post import google_search , advanced_post
|
| 6 |
-
from
|
| 7 |
from langchain_groq import ChatGroq
|
| 8 |
#from langchain import HuggingFaceHub
|
| 9 |
|
|
|
|
| 3 |
import openai
|
| 4 |
from paraphrase_post import get_original_url , paraphrased_post
|
| 5 |
from advance_post import google_search , advanced_post
|
| 6 |
+
from langchain_community.chat_models import ChatOpenAI
|
| 7 |
from langchain_groq import ChatGroq
|
| 8 |
#from langchain import HuggingFaceHub
|
| 9 |
|
paraphrase_post.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
from langchain_community.document_loaders import WebBaseLoader
|
| 2 |
from langchain.prompts import ChatPromptTemplate
|
| 3 |
from langchain.output_parsers import ResponseSchema
|
| 4 |
from langchain.output_parsers import StructuredOutputParser
|
|
@@ -45,6 +44,7 @@ def get_original_url(url):
|
|
| 45 |
def paraphrased_post(url,model):
|
| 46 |
|
| 47 |
post=scrappost(url)
|
|
|
|
| 48 |
|
| 49 |
template="""You are a helpful paraphraser tool. You are provided with a content and your task is to paraphrase it.
|
| 50 |
{data}"""
|
|
|
|
|
|
|
| 1 |
from langchain.prompts import ChatPromptTemplate
|
| 2 |
from langchain.output_parsers import ResponseSchema
|
| 3 |
from langchain.output_parsers import StructuredOutputParser
|
|
|
|
| 44 |
def paraphrased_post(url,model):
|
| 45 |
|
| 46 |
post=scrappost(url)
|
| 47 |
+
print(post)
|
| 48 |
|
| 49 |
template="""You are a helpful paraphraser tool. You are provided with a content and your task is to paraphrase it.
|
| 50 |
{data}"""
|
scrap_post.py
CHANGED
|
@@ -1,33 +1,13 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
|
| 5 |
|
| 6 |
def scrappost(url):
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
for element in glossary_page:
|
| 15 |
-
if 'unstructured.documents.html.HTMLTitle' in str(type(element)):
|
| 16 |
-
# If there's already content in the group, add it to all_groups
|
| 17 |
-
if group['page_content']:
|
| 18 |
-
all_groups.append(group)
|
| 19 |
-
group = {'page_content': ''}
|
| 20 |
-
group['page_content'] += element.text
|
| 21 |
-
if 'unstructured.documents.html.HTMLNarrativeText' in str(type(element)):
|
| 22 |
-
group['page_content'] += element.text
|
| 23 |
-
|
| 24 |
-
if "unstructured.documents.html.HTMLListItem" in str(type(element)):
|
| 25 |
-
group['page_content']+=element.text
|
| 26 |
-
|
| 27 |
-
# # Add the last group if it exists
|
| 28 |
-
if group['page_content']:
|
| 29 |
-
all_groups.append(group)
|
| 30 |
-
|
| 31 |
-
# Print the groups
|
| 32 |
-
for group in all_groups[:1]:
|
| 33 |
-
return group["page_content"]
|
|
|
|
import requests
import json
from bs4 import BeautifulSoup


def scrappost(url):
    """Scrape the article body of a post from the given URL.

    Fetches the page, then looks through every ``<script type="application/ld+json">``
    block for a JSON object containing an ``articleBody`` field (the schema.org
    structured-data field carrying the post text).

    Parameters
    ----------
    url : str
        Address of the post page to scrape.

    Returns
    -------
    str | None
        The article body of the first ld+json block that provides one,
        or ``None`` when no block contains an ``articleBody`` field.

    Raises
    ------
    requests.RequestException
        If the HTTP request fails or times out.
    """
    # Timeout so a hung server cannot block the caller indefinitely.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    # schema.org structured data is embedded in ld+json script tags.
    script_tags = soup.find_all('script', type="application/ld+json")
    for tag in script_tags:
        # A page may carry several ld+json blocks; skip malformed ones and
        # blocks without an articleBody instead of crashing on the first.
        try:
            payload = json.loads(tag.get_text())
        except (json.JSONDecodeError, ValueError):
            continue
        if isinstance(payload, dict) and 'articleBody' in payload:
            return payload['articleBody']
    # Explicitly signal "nothing found" rather than falling through.
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|