Spaces:
Sleeping
Sleeping
File size: 2,497 Bytes
from langchain.vectorstores import LanceDB, Pinecone as pc
from langchain.llms import OpenAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.schema import Document
from pypdf import PdfReader
from langchain.llms.openai import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.llms import HuggingFaceHub
import lancedb
#Extract Information from PDF file
def get_pdf_text(pdf_doc):
    """Return the text of every page of a PDF joined into one string.

    Args:
        pdf_doc: The PDF source; it is passed unchanged to ``PdfReader``.

    Returns:
        The per-page results of ``extract_text()`` concatenated in page
        order with no separator. A page whose extraction yields a falsy
        value (``None`` or ``""``) contributes nothing.
    """
    pdf_reader = PdfReader(pdf_doc)
    # Join once instead of growing a string with ``+=`` for every page;
    # ``or ""`` keeps a page without extractable text from raising TypeError.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Iterate over the PDF files the user uploaded, one by one,
# and wrap each file in a Document.
def create_docs(user_pdf_list, unique_id):
    """Build one ``Document`` per uploaded PDF file.

    Each file becomes a single document holding the file's full extracted
    text; the text is not split into smaller chunks here.

    Args:
        user_pdf_list: Iterable of uploaded-file objects. Each one is handed
            to ``get_pdf_text`` and must expose ``name``, ``file_id``,
            ``type`` and ``size`` attributes.
        unique_id: Value stored under ``"unique_id"`` in the metadata of
            every document.

    Returns:
        A list of ``Document`` objects, in the same order as
        ``user_pdf_list``.
    """
    return [
        Document(
            page_content=get_pdf_text(pdf_file),
            metadata={
                "name": pdf_file.name,
                "id": pdf_file.file_id,
                # This key used to be spelled "type=" (stray "="); fixed so
                # the file type is stored under the expected name.
                "type": pdf_file.type,
                "size": pdf_file.size,
                "unique_id": unique_id,
            },
        )
        for pdf_file in user_pdf_list
    ]
#Create embeddings instance
def create_embeddings_load_data():
    """Return a new ``SentenceTransformerEmbeddings`` instance.

    The model name is fixed to ``all-MiniLM-L6-v2``.
    """
    # Alternative that was tried here previously: OpenAIEmbeddings()
    model_name = "all-MiniLM-L6-v2"
    return SentenceTransformerEmbeddings(model_name=model_name)
# push to lancedb
def push_to_lancedb(embeddings):
    """Create the ``resumes`` table in LanceDB and return it.

    The table is created with ``mode="overwrite"`` and seeded with a single
    placeholder row so that it has a vector, a text and an id column.

    Args:
        embeddings: Object providing ``embed_query(text)``; used to compute
            the vector of the placeholder row.

    Returns:
        The table object returned by ``create_table``.
    """
    connection = lancedb.connect("/lancedb")

    # Sample data only: it is meant to be replaced once new docs are uploaded.
    placeholder_text = "Hello World"
    seed_row = {
        "vector": embeddings.embed_query(placeholder_text),
        "text": placeholder_text,
        "id": "1",
    }

    return connection.create_table("resumes", data=[seed_row], mode="overwrite")
def pull_from_lancedb(table, embeddings, docs):
    """Build a LangChain ``LanceDB`` vector store from ``docs``.

    Args:
        table: Passed to ``LanceDB.from_documents`` as ``connection``.
        embeddings: Passed to ``LanceDB.from_documents`` as ``embedding``.
        docs: Passed to ``LanceDB.from_documents`` as ``documents``.

    Returns:
        The vector store returned by ``LanceDB.from_documents``.
    """
    # NOTE(review): the name suggests a read, but this goes through
    # ``from_documents``, which presumably embeds ``docs`` and writes them
    # to ``table`` - confirm against the LangChain LanceDB documentation.
    return LanceDB.from_documents(
        documents=docs,
        embedding=embeddings,
        connection=table,
    )
def similar_docs_lancedb(query, table, embeddings, docs):
    """Run a similarity search for ``query`` over a store built from ``docs``.

    Args:
        query: Search text handed to ``similarity_search``.
        table: Forwarded to ``pull_from_lancedb``.
        embeddings: Forwarded to ``pull_from_lancedb``.
        docs: Forwarded to ``pull_from_lancedb``.

    Returns:
        Whatever ``similarity_search(query)`` returns on the resulting store.
    """
    # The vector store is rebuilt from ``docs`` on every call before searching.
    return pull_from_lancedb(table, embeddings, docs).similarity_search(query)
# Helps us get the summary of a document
def get_summary(current_doc):
    """Summarize one document with a ``map_reduce`` summarization chain.

    Args:
        current_doc: The document to summarize; it is passed to the chain
            wrapped in a one-element list.

    Returns:
        The value returned by the chain's ``run`` call.
    """
    # Alternative backend that was tried here previously:
    # HuggingFaceHub(repo_id="bigscience/bloom", model_kwargs={"temperature":1e-10})
    summarizer = load_summarize_chain(
        OpenAI(temperature=0),
        chain_type="map_reduce",
    )
    return summarizer.run([current_doc])
|