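"""Helpers for a retrieval-augmented generation (RAG) pipeline: ingest files
into a ChromaDB vector store, remove them by file id, and answer queries with
an LLM over the retrieved chunks."""
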
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader,
    UnstructuredWordDocumentLoader,
    TextLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
)

from logger import logger

def add_file_to_chroma(file_path, file_id, hugging_face_ef, db, logger):
    """Add file chunks to ChromaDB with advanced document handling."""
    extension = file_path.split(".")[-1].lower()
    loader_map = {
        "pdf": PyPDFLoader,
        "docx": UnstructuredWordDocumentLoader,
        "txt": TextLoader,
        "html": UnstructuredHTMLLoader,
        "md": UnstructuredMarkdownLoader,
    }
    
    if extension not in loader_map:
        raise ValueError(f"Unsupported file type: {extension}")
    
    try:
        # Load document using appropriate loader
        loader = loader_map[extension](file_path)
        documents = loader.load()
        
        # Split text into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500,
            chunk_overlap=200,
            length_function=len,
            add_start_index=True
        )
        texts = text_splitter.split_documents(documents)
        
        # Add metadata
        for text in texts:
            text.metadata.update({
                "file_id": str(file_id),
                "file_name": os.path.basename(file_path),
                "file_type": extension
            })
        
        # Save to ChromaDB (the embedding function is configured on the Chroma
        # instance itself, so it is not passed per call)
        db.add_documents(texts)
        
        # Clean up uploaded file
        if os.path.exists(file_path):
            os.remove(file_path)
            
        logger.info(f"Added file '{file_path}' to ChromaDB")
        return True
        
    except Exception as e:
        logger.error(f"Error processing file {file_path}: {str(e)}")
        # Clean up the uploaded file even on failure, then re-raise
        if os.path.exists(file_path):
            os.remove(file_path)
        raise

def remove_file_from_chroma(file_id, db):
    """Remove file chunks from ChromaDB."""
    try:
        # Get chunks for file_id
        results = db.get(where={"file_id": str(file_id)})
        
        if results and results["ids"]:
            db.delete(ids=results["ids"])
            return True
        return False
        
    except Exception as e:
        logger.error(f"Error removing file from ChromaDB: {str(e)}")
        return False

def generate_query_response(query, db, llm_model, PROMPT_TEMPLATE):
    """Generate response for a query using the documents in ChromaDB."""
    try:
        # Search for relevant documents with scores
        top_related = db.similarity_search_with_relevance_scores(query, k=4)
        
        # Treat the retrieval as relevant if the top score clears a 0.4 threshold
        is_relevant = top_related[0][1] >= 0.4 if top_related else False
        
        # Build context from all retrieved chunks (an answer is generated even
        # when relevance is low; callers can check is_relevant before using it)
        context = "\n".join([chunk[0].page_content for chunk in top_related])
        
        # Generate response using the LLM
        prompt = PROMPT_TEMPLATE.format(context=context, query=query)
        answer = llm_model.generate_content(prompt).text
        
        logger.info(f"Query : {query}\nResponse:{answer}")
        
        # Prepare response with sources
        return {
            "is_relevant": is_relevant,
            "answer": answer,
            "sources": [{
                "page_content": chunk[0].page_content,
                "score": chunk[1],
                "metadata": chunk[0].metadata
            } for chunk in top_related]
        }
        
    except Exception as e:
        logger.error(f"Error generating response: {str(e)}")
        return {
            "is_relevant": False,
            "answer": "An error occurred while processing your query.",
            "error": str(e)
        }
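
if __name__ == "__main__":
    # Minimal usage sketch, not part of the module API. Assumptions: a local
    # "chroma_db" persist directory, the "sentence-transformers/all-MiniLM-L6-v2"
    # embedding model, a GOOGLE_API_KEY in the environment, and a sample file at
    # "docs/sample.pdf" (which add_file_to_chroma deletes after ingestion).
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import Chroma
    import google.generativeai as genai

    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
    hugging_face_ef = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    db = Chroma(persist_directory="chroma_db", embedding_function=hugging_face_ef)
    llm_model = genai.GenerativeModel("gemini-1.5-flash")
    PROMPT_TEMPLATE = (
        "Answer the question using only the context below.\n\n"
        "Context:\n{context}\n\nQuestion: {query}"
    )

    add_file_to_chroma("docs/sample.pdf", file_id=1,
                       hugging_face_ef=hugging_face_ef, db=db, logger=logger)
    result = generate_query_response("What is the document about?",
                                     db, llm_model, PROMPT_TEMPLATE)
    print(result["answer"])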