danicafisher committed on
Commit
8a474d4
·
verified ·
1 Parent(s): e3026b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -0
app.py CHANGED
@@ -2,11 +2,53 @@
2
  """
3
  IMPORTS HERE
4
  """
 
 
 
 
 
 
 
 
 
 
5
 
6
  ### Global Section ###
7
  """
8
  GLOBAL CODE HERE
9
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  ### On Chat Start (Session Start) Section ###
12
  @cl.on_chat_start
 
2
  """
3
  IMPORTS HERE
4
  """
5
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
6
+ from langchain_community.document_loaders import PyMuPDFLoader
7
+ from qdrant_client import QdrantClient
8
+ from qdrant_client.http.models import Distance, VectorParams
9
+ from langchain_openai.embeddings import OpenAIEmbeddings
10
+ from langchain.storage import LocalFileStore
11
+ from langchain_qdrant import QdrantVectorStore
12
+ from langchain.embeddings import CacheBackedEmbeddings
13
+
14
+
15
 
16
  ### Global Section ###
17
  """
18
  GLOBAL CODE HERE
19
  """
20
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
21
+ Loader = PyMuPDFLoader
22
+ loader = Loader(file_path)
23
+ documents = loader.load()
24
+ docs = text_splitter.split_documents(documents)
25
+ for i, doc in enumerate(docs):
26
+ doc.metadata["source"] = f"source_{i}"
27
+
28
+ # Typical Embedding Model
29
+ core_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
30
+
31
+ # Typical QDrant Client Set-up
32
+ collection_name = f"pdf_to_parse_{uuid.uuid4()}"
33
+ client = QdrantClient(":memory:")
34
+ client.create_collection(
35
+ collection_name=collection_name,
36
+ vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
37
+ )
38
+
39
+ # Adding cache!
40
+ store = LocalFileStore("./cache/")
41
+ cached_embedder = CacheBackedEmbeddings.from_bytes_store(
42
+ core_embeddings, store, namespace=core_embeddings.model
43
+ )
44
+
45
+ # Typical QDrant Vector Store Set-up
46
+ vectorstore = QdrantVectorStore(
47
+ client=client,
48
+ collection_name=collection_name,
49
+ embedding=cached_embedder)
50
+ vectorstore.add_documents(docs)
51
+ retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})
52
 
53
  ### On Chat Start (Session Start) Section ###
54
  @cl.on_chat_start