File size: 3,425 Bytes
2e7e624
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0cc0559
2e7e624
 
 
 
8736c1e
2e7e624
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85408bf
 
2e7e624
85408bf
 
2e7e624
85408bf
 
2e7e624
85408bf
2e7e624
85408bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e7e624
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import os
import uuid
from dotenv import load_dotenv
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import cohere

# Load variables from a .env file into the environment before the
# os.getenv() lookups below read the API keys.
load_dotenv()


# Pinecone client and a handle to the vector index used by ingest() and
# retrieve().
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index("mini-rag-project-1file")  # dimension=384


# Cohere client used by rerank().
# NOTE(review): os.getenv returns None when COHERE_API_KEY is unset; confirm
# how cohere.Client behaves in that case.
co = cohere.Client(os.getenv("COHERE_API_KEY"))


# Embedding model shared by ingest() (document chunks) and retrieve()
# (queries), so both sides live in the same vector space.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dim


# Seq2seq tokenizer/model pair used by answer() to generate the response.
model_name = "google/flan-t5-small" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


def split_text(text: str, chunk_size: int = 800, overlap: int = 80) -> list:
    """
    Split text into chunks of up to chunk_size words, where consecutive
    chunks share `overlap` words.

    Words are whitespace-separated tokens (``str.split()``); each chunk is
    the words re-joined with single spaces.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words repeated between consecutive chunks.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings; empty when text contains no words.

    Raises:
        ValueError: If chunk_size or overlap is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size would give a zero step; a larger overlap
        # would give a negative step and silently produce no chunks.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap

    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window
        # would contain only words already present in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

def ingest(text: str, source: str = "user", batch_size: int = 100):
    """
    Ingest text into Pinecone:
    1. Chunk text
    2. Generate embeddings
    3. Upsert vectors with metadata (source, position, text) in batches

    Args:
        text: Raw text to index.
        source: Label stored under "source" in each vector's metadata.
        batch_size: Maximum number of vectors sent per upsert request.

    Returns:
        None.

    Raises:
        ValueError: If batch_size is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    chunks = split_text(text)
    if not chunks:
        # Empty or whitespace-only text: nothing to embed, and an upsert
        # with an empty vector list is not a meaningful request.
        return

    # NOTE(review): the embedding model may truncate long inputs; confirm
    # that split_text's default chunk size fits its maximum sequence length.
    embeddings = embed_model.encode(chunks)

    vectors = [
        {
            "id": str(uuid.uuid4()),
            "values": emb.tolist(),
            "metadata": {
                "source": source,
                "position": position,
                "text": chunk,
            },
        }
        for position, (chunk, emb) in enumerate(zip(chunks, embeddings))
    ]

    # Send bounded batches so a large document does not become one
    # oversized request.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors[start:start + batch_size])


def retrieve(query, top_k=10):
    """
    Embed the query and fetch the top_k nearest matches from Pinecone.

    Returns a list of dicts with keys "id", "text", "metadata" and "score",
    in the order the index returned the matches.
    """
    encoded_query = embed_model.encode(query).tolist()
    response = index.query(
        vector=encoded_query,
        top_k=top_k,
        include_metadata=True
    )

    def _as_doc(hit):
        # Flatten one match into the dict shape used by the rest of the
        # pipeline; the chunk text is lifted out of the metadata.
        meta = hit.get("metadata", {})
        return {
            "id": hit.get("id"),
            "text": meta.get("text", ""),
            "metadata": meta,
            "score": hit.get("score", 0),
        }

    return [_as_doc(hit) for hit in response.get("matches", [])]

def rerank(query, docs, top_n=5):
    """
    Optional: Re-rank retrieved docs using Cohere.

    Args:
        query: The search query the docs are scored against.
        docs: List of dicts, each with a "text" key.
        top_n: Maximum number of re-ranked docs to return.

    Returns:
        `docs` unchanged when it is empty or no Cohere client is available.
        Otherwise a new list of at most top_n docs in the order Cohere
        returned them, each a shallow copy of the input doc with an added
        "rerank_score" key. The caller's dicts are not modified.
    """
    # Check docs first so an empty result set never needs the client.
    if not docs or not co:
        return docs

    documents = [d["text"] for d in docs]
    response = co.rerank(
        model="rerank-english-v3.0",
        query=query,
        documents=documents,
        top_n=top_n
    )

    reranked_docs = []
    for r in response.results:
        # Copy before annotating so the score does not leak into the
        # caller's doc dicts. r.index points back into the input list.
        doc = dict(docs[r.index])
        doc["rerank_score"] = r.relevance_score
        reranked_docs.append(doc)

    return reranked_docs[:top_n]


def answer(query, docs):
    """
    Generate an answer to the query from the supplied context docs.

    The model input is capped at 512 tokens. Because the context comes
    before the question in the prompt, the context is trimmed to the token
    budget left after the instructions and the question; otherwise a long
    context would push the question out of the truncated input.

    Args:
        query: The question to answer.
        docs: list of strings OR list of dicts with 'text'

    Returns:
        The decoded model output as a string.
    """
    max_input_tokens = 512

    # Join all docs into one context string.
    context_text = "\n\n".join(
        d["text"] if isinstance(d, dict) else d
        for d in docs
    )

    # Same prompt text as a single template would produce, split around
    # the context so each part can be measured separately.
    head = "\n    Answer the question using ONLY the context below.\n\n    Context:\n    "
    tail = f"\n\n    Question:\n    {query}\n\n    Answer:\n    "

    def _token_ids(piece):
        # Token ids for a prompt fragment, without special tokens.
        return tokenizer(piece, add_special_tokens=False)["input_ids"]

    # Reserve one slot for the end-of-sequence token added to the full prompt.
    budget = max_input_tokens - len(_token_ids(head)) - len(_token_ids(tail)) - 1

    context_ids = _token_ids(context_text)
    if len(context_ids) > budget:
        # Keep only the leading part of the context that fits.
        context_text = tokenizer.decode(
            context_ids[:max(budget, 0)],
            skip_special_tokens=True
        )

    prompt = head + context_text + tail

    # Truncation stays on as a safety net, e.g. for a query that alone
    # exceeds the limit.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    )

    outputs = model.generate(
        **inputs,
        max_length=200,
        num_beams=4,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)