# NOTE(review): the lines that were here were page-scrape residue (a Hugging
# Face "Spaces" header, git blame hashes 0b42653/0c9476c/d3ab78b/64cadc5, and
# a line-number gutter) that made this module unimportable; preserved as a
# comment. Original "File size: 3,779 Bytes".
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader,UnstructuredWordDocumentLoader,TextLoader,UnstructuredHTMLLoader,UnstructuredMarkdownLoader
import os
from logger import logger
def add_file_to_chroma(file_path, file_id, hugging_face_ef, db, logger):
    """Load a document, split it into chunks, and store the chunks in ChromaDB.

    Args:
        file_path: Path to the uploaded file on disk. The file is deleted
            after processing (both on success and on failure).
        file_id: Identifier stamped into each chunk's metadata so the file's
            chunks can be removed later.
        hugging_face_ef: Embedding function forwarded to the vector store.
        db: ChromaDB (langchain) vector-store instance.
        logger: Logger used for success / error reporting.

    Returns:
        True on success.

    Raises:
        ValueError: If the file extension is not supported.
        Exception: Re-raised from the loader / splitter / store on failure,
            after best-effort removal of the uploaded file.
    """
    SUPPORTED_EXTENSIONS = ("pdf", "docx", "txt", "html", "md")
    # os.path.splitext is robust to dots inside directory names and to files
    # without an extension, unlike a naive str.split(".")[-1].
    extension = os.path.splitext(file_path)[1].lstrip(".").lower()
    if extension not in SUPPORTED_EXTENSIONS:
        raise ValueError(f"Unsupported file type: {extension}")
    # Built after validation so unsupported files fail fast.
    loader_map = {
        "pdf": PyPDFLoader,
        "docx": UnstructuredWordDocumentLoader,
        "txt": TextLoader,
        "html": UnstructuredHTMLLoader,
        "md": UnstructuredMarkdownLoader,
    }
    try:
        # Load the document with the format-appropriate loader.
        loader = loader_map[extension](file_path)
        documents = loader.load()
        # Split text into overlapping chunks; start indices are kept so
        # sources can be traced back to positions in the original document.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500,
            chunk_overlap=200,
            length_function=len,
            add_start_index=True,
        )
        texts = text_splitter.split_documents(documents)
        # Tag every chunk so it can be filtered / removed by file later.
        for text in texts:
            text.metadata.update({
                "file_id": str(file_id),
                "file_name": os.path.basename(file_path),
                "file_type": extension,
            })
        # NOTE(review): langchain's Chroma.add_documents does not document an
        # `embedding` kwarg (embeddings are normally bound at store
        # construction) — confirm this is honored rather than silently ignored.
        db.add_documents(texts, embedding=hugging_face_ef)
        # Clean up the uploaded file now that its content is indexed.
        if os.path.exists(file_path):
            os.remove(file_path)
        logger.info(f"Added file '{file_path}' to ChromaDB")
        return True
    except Exception as e:
        logger.error(f"Error processing file {file_path}: {str(e)}")
        # Best-effort cleanup of the uploaded file before propagating.
        if os.path.exists(file_path):
            os.remove(file_path)
        raise  # bare raise preserves the original traceback
def remove_file_from_chroma(file_id, db):
    """Delete every chunk belonging to *file_id* from ChromaDB.

    Returns True when matching chunks were found and deleted, False when
    nothing matched or an error occurred (errors are logged, not raised).
    """
    try:
        # Look up all chunk ids previously tagged with this file_id.
        matches = db.get(where={"file_id": str(file_id)})
        chunk_ids = matches['ids'] if matches else None
        if not chunk_ids:
            return False
        db.delete(ids=chunk_ids)
        return True
    except Exception as e:
        logger.error(f"Error removing file from ChromaDB: {str(e)}")
        return False
def generate_query_response(query, db, llm_model, PROMPT_TEMPLATE,
                            relevance_threshold=0.4, k=4):
    """Answer *query* using the most relevant document chunks in ChromaDB.

    Args:
        query: The user's question.
        db: ChromaDB (langchain) vector-store instance.
        llm_model: Model exposing `generate_content(prompt).text`.
        PROMPT_TEMPLATE: Template with `{context}` and `{query}` placeholders.
        relevance_threshold: Minimum relevance score of the top hit for the
            response to be flagged relevant (default 0.4, as before).
        k: Number of chunks to retrieve (default 4, as before).

    Returns:
        dict with keys `is_relevant`, `answer`, and `sources` (each source
        carries `page_content`, `score`, `metadata`); on failure a dict with
        `is_relevant=False`, a generic `answer`, and the `error` string.
    """
    try:
        # Retrieve the k most similar chunks together with relevance scores.
        top_related = db.similarity_search_with_relevance_scores(query, k=k)
        # Relevance is judged on the best-scoring hit only.
        is_relevant = top_related[0][1] >= relevance_threshold if top_related else False
        # All retrieved chunks feed the prompt context, relevant or not
        # (preserves original behavior; the caller decides via is_relevant).
        context = "\n".join(doc.page_content for doc, _score in top_related)
        prompt = PROMPT_TEMPLATE.format(context=context, query=query)
        answer = llm_model.generate_content(prompt).text
        logger.info(f"Query : {query}\nResponse:{answer}")
        return {
            "is_relevant": is_relevant,
            "answer": answer,
            "sources": [
                {
                    "page_content": doc.page_content,
                    "score": score,
                    "metadata": doc.metadata,
                }
                for doc, score in top_related
            ],
        }
    except Exception as e:
        logger.error(f"Error generating response: {str(e)}")
        return {
            "is_relevant": False,
            "answer": "An error occurred while processing your query.",
            "error": str(e),
        }