Spaces:

Atreyu4EVR
/

Multi-OpenSource

Build error

App Files Community

Atreyu4EVR committed on Aug 6, 2024

Commit

50636d8

verified ·

1 Parent(s): 96d0c1a

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -16

app.py CHANGED Viewed

@@ -77,24 +77,57 @@ def setup_advanced_rag_pipeline(model_name):
     # Set up language model
     llm = HuggingFaceHub(repo_id=model_links[model_name], model_kwargs={"temperature": 0.5, "max_length": 4000})
-    # Set up HyDE
-    hyde_prompt = PromptTemplate(
-        input_variables=["question"],
-        template="Please write a passage to answer the question\nQuestion: {question}\nPassage:"
-    )
-    hyde_chain = LLMChain(llm=llm, prompt=hyde_prompt)
-    def hyde_retriever(query):
-        hypothetical_doc = hyde_chain.run(query)
-        hyde_embedding = embeddings.embed_query(hypothetical_doc)
-        return vectorstore.similarity_search_by_vector(hyde_embedding, k=3)
-    # Set up ContextualCompressionRetriever
-    compressor = LLMChainExtractor.from_llm(llm)
-    compression_retriever = ContextualCompressionRetriever(
-        base_compressor=compressor,
-        base_retriever=hyde_retriever
-    )
     # Create RetrievalQA chain
     qa_chain = RetrievalQA.from_chain_type(

     # Set up language model
     llm = HuggingFaceHub(repo_id=model_links[model_name], model_kwargs={"temperature": 0.5, "max_length": 4000})
+    def load_and_process_json(file_path):
+        """Load documents from a JSON file and split them into chunks.
+
+        Reads the top-level "documents" list from the JSON file, wraps each
+        entry in a Document (its "content" as the page text, its "title" and
+        "id" as metadata) and splits the result with
+        RecursiveCharacterTextSplitter using CHUNK_SIZE and CHUNK_OVERLAP.
+
+        Args:
+            file_path: path of the JSON file to read.
+
+        Returns:
+            The chunks returned by the splitter's split_documents().
+
+        Raises:
+            ValueError: if the "documents" key is missing or empty.
+            KeyError: if an entry lacks "content", "title" or "id".
+        """
+        # The body must be indented one level below the def line; at the
+        # same level Python rejects the file with an IndentationError.
+        with open(file_path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+        documents = data.get("documents", [])
+        if not documents:
+            raise ValueError("No valid documents found in JSON file.")
+        # Create Document objects
+        doc_objects = [
+            Document(
+                page_content=doc["content"],
+                metadata={"title": doc["title"], "id": doc["id"]}
+            ) for doc in documents
+        ]
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
+        splits = text_splitter.split_documents(doc_objects)
+        return splits
+    def get_vectorstore(file_path):
+        """Return the Chroma vectorstore, building it from file_path if needed.
+
+        When VECTORSTORE_PATH already exists on disk, the persisted store is
+        opened and returned without reading file_path. Otherwise the JSON
+        file is loaded and split, the chunks are added in batches of
+        BATCH_SIZE, and the store is persisted to VECTORSTORE_PATH.
+
+        Args:
+            file_path: path of the JSON file used when a new store is built.
+
+        Raises:
+            ValueError: if the file yields no chunks to index.
+        """
+        # Check if vectorstore already exists
+        if os.path.exists(VECTORSTORE_PATH):
+            print("Loading existing vectorstore...")
+            return Chroma(persist_directory=VECTORSTORE_PATH, embedding_function=embeddings)
+        print("Creating new vectorstore...")
+        splits = load_and_process_json(file_path)
+        # Without this guard the loop below never runs, vectorstore stays
+        # None, and persist() fails with an unhelpful AttributeError.
+        if not splits:
+            raise ValueError("No document chunks to index; cannot create vectorstore.")
+        # Process in batches
+        vectorstore = None
+        for i in tqdm(range(0, len(splits), BATCH_SIZE), desc="Processing batches"):
+            batch = splits[i:i+BATCH_SIZE]
+            if vectorstore is None:
+                # The first batch creates the store; later batches extend it.
+                vectorstore = Chroma.from_documents(documents=batch, embedding=embeddings, persist_directory=VECTORSTORE_PATH)
+            else:
+                vectorstore.add_documents(documents=batch)
+        vectorstore.persist()
+        return vectorstore
+    def setup_rag_pipeline(file_path):
+        """Build a RetrievalQA chain backed by the vectorstore for file_path.
+
+        The chain uses the enclosing scope's llm with the "stuff" chain type,
+        retrieves RETRIEVER_K documents per query and returns the source
+        documents alongside the answer.
+        """
+        store = get_vectorstore(file_path)
+        top_k_retriever = store.as_retriever(search_kwargs={"k": RETRIEVER_K})
+        return RetrievalQA.from_chain_type(
+            llm=llm,
+            chain_type="stuff",
+            retriever=top_k_retriever,
+            return_source_documents=True,
+        )
     # Create RetrievalQA chain
     qa_chain = RetrievalQA.from_chain_type(