Spaces:
Sleeping
Sleeping
fixed embedding function
Browse files
app.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
import os
|
| 2 |
import gradio as gr
|
| 3 |
from openai import AzureOpenAI
|
| 4 |
-
from sentence_transformers import SentenceTransformer
|
| 5 |
from langchain_community.document_loaders import PyPDFLoader
|
| 6 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_community.vectorstores import Chroma
|
|
|
|
| 8 |
|
| 9 |
# Load PDF (Tiruvāsagam)
|
| 10 |
loader = PyPDFLoader("tiru.pdf")
|
|
@@ -15,27 +15,24 @@ splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
|
|
| 15 |
chunks = splitter.split_documents(docs)
|
| 16 |
|
| 17 |
# Local embedding model (Tamil capable)
|
| 18 |
-
embedding_model =
|
| 19 |
-
def embed(texts): return embedding_model.encode(texts, convert_to_numpy=True)
|
| 20 |
|
| 21 |
# Store in Chroma
|
| 22 |
-
vectorstore = Chroma.from_documents(chunks,
|
| 23 |
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":3})
|
| 24 |
|
| 25 |
# Azure OpenAI client
|
| 26 |
client = AzureOpenAI(
|
| 27 |
-
api_key=os.getenv("AZURE_OPENAI_API_KEY"),
|
| 28 |
api_version="2025-01-01-preview",
|
| 29 |
-
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
|
| 30 |
)
|
| 31 |
|
| 32 |
# Chat function
|
| 33 |
def chat_fn(message, history):
|
| 34 |
-
# Retrieve relevant chunks
|
| 35 |
docs = retriever.get_relevant_documents(message)
|
| 36 |
context = "\n\n".join([d.page_content for d in docs])
|
| 37 |
|
| 38 |
-
# Call Azure OpenAI (GPT-4)
|
| 39 |
completion = client.chat.completions.create(
|
| 40 |
model="gpt-4.1", # your Azure deployment name
|
| 41 |
messages=[
|
|
|
|
| 1 |
import os
|
| 2 |
import gradio as gr
|
| 3 |
from openai import AzureOpenAI
|
|
|
|
| 4 |
from langchain_community.document_loaders import PyPDFLoader
|
| 5 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 6 |
from langchain_community.vectorstores import Chroma
|
| 7 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 8 |
|
| 9 |
# Load PDF (Tiruvāsagam)
|
| 10 |
loader = PyPDFLoader("tiru.pdf")
|
|
|
|
| 15 |
chunks = splitter.split_documents(docs)
|
| 16 |
|
| 17 |
# Local embedding model — a multilingual sentence-embedding model that
# handles Tamil text (the corpus is the Tiruvāsagam PDF loaded above).
embedding_model = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large",
)

# Index the split documents in an in-memory Chroma vector store, then
# expose a similarity retriever that returns the top 3 matching chunks.
vectorstore = Chroma.from_documents(chunks, embedding=embedding_model)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
|
| 23 |
|
| 24 |
# Azure OpenAI client.
# Fail fast with a clear message when credentials are missing:
# os.getenv() returns None for an unset variable, so the previous
# os.getenv(...).strip() crashed with a cryptic
# "AttributeError: 'NoneType' object has no attribute 'strip'".
_azure_api_key = os.getenv("AZURE_OPENAI_API_KEY")
_azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
if not _azure_api_key or not _azure_endpoint:
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT environment "
        "variables must be set."
    )

client = AzureOpenAI(
    # .strip() guards against stray whitespace/newlines pasted into secrets
    api_key=_azure_api_key.strip(),
    api_version="2025-01-01-preview",
    azure_endpoint=_azure_endpoint.strip(),
)
|
| 30 |
|
| 31 |
# Chat function
|
| 32 |
def chat_fn(message, history):
|
|
|
|
| 33 |
docs = retriever.get_relevant_documents(message)
|
| 34 |
context = "\n\n".join([d.page_content for d in docs])
|
| 35 |
|
|
|
|
| 36 |
completion = client.chat.completions.create(
|
| 37 |
model="gpt-4.1", # your Azure deployment name
|
| 38 |
messages=[
|