Spaces:
Sleeping
Sleeping
Commit ·
79d5a5c
1
Parent(s): f6a9f63
eski holat
Browse files
- app.py +1 -0
- index_retriever.py +1 -26
- utils.py +6 -0
app.py
CHANGED
|
@@ -103,6 +103,7 @@ def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
|
|
| 103 |
from llama_index.core.text_splitter import TokenTextSplitter
|
| 104 |
|
| 105 |
embed_model = get_embedding_model()
|
|
|
|
| 106 |
llm = get_llm_model(DEFAULT_MODEL)
|
| 107 |
reranker = get_reranker_model()
|
| 108 |
|
|
|
|
| 103 |
from llama_index.core.text_splitter import TokenTextSplitter
|
| 104 |
|
| 105 |
embed_model = get_embedding_model()
|
| 106 |
+
|
| 107 |
llm = get_llm_model(DEFAULT_MODEL)
|
| 108 |
reranker = get_reranker_model()
|
| 109 |
|
index_retriever.py
CHANGED
|
@@ -12,32 +12,7 @@ def create_vector_index(documents):
|
|
| 12 |
log_message("Строю векторный индекс")
|
| 13 |
return VectorStoreIndex.from_documents(documents)
|
| 14 |
|
| 15 |
-
|
| 16 |
-
"""Deduplicate retrieved nodes based on unique identifiers"""
|
| 17 |
-
seen = set()
|
| 18 |
-
unique_nodes = []
|
| 19 |
-
|
| 20 |
-
for node in nodes:
|
| 21 |
-
# Create unique identifier from metadata
|
| 22 |
-
doc_id = node.metadata.get('document_id', '')
|
| 23 |
-
section_id = node.metadata.get('section_id', '')
|
| 24 |
-
chunk_id = node.metadata.get('chunk_id', 0)
|
| 25 |
-
node_type = node.metadata.get('type', 'text')
|
| 26 |
-
|
| 27 |
-
if node_type == 'table':
|
| 28 |
-
table_num = node.metadata.get('table_number', '')
|
| 29 |
-
identifier = f"{doc_id}|table|{table_num}|{chunk_id}"
|
| 30 |
-
elif node_type == 'image':
|
| 31 |
-
img_num = node.metadata.get('image_number', '')
|
| 32 |
-
identifier = f"{doc_id}|image|{img_num}"
|
| 33 |
-
else:
|
| 34 |
-
identifier = f"{doc_id}|{section_id}|{chunk_id}"
|
| 35 |
-
|
| 36 |
-
if identifier not in seen:
|
| 37 |
-
seen.add(identifier)
|
| 38 |
-
unique_nodes.append(node)
|
| 39 |
-
|
| 40 |
-
return unique_nodes
|
| 41 |
|
| 42 |
def create_query_engine(vector_index):
|
| 43 |
try:
|
|
|
|
| 12 |
log_message("Строю векторный индекс")
|
| 13 |
return VectorStoreIndex.from_documents(documents)
|
| 14 |
|
| 15 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
def create_query_engine(vector_index):
|
| 18 |
try:
|
utils.py
CHANGED
|
@@ -2,6 +2,12 @@ from llama_index.llms.google_genai import GoogleGenAI
|
|
| 2 |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
| 3 |
from sentence_transformers import CrossEncoder
|
| 4 |
from my_logging import log_message
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
def get_llm_model(api_key, model_name="gemini-2.0-flash"):
|
| 7 |
return GoogleGenAI(model=model_name, api_key=api_key)
|
|
|
|
| 2 |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
| 3 |
from sentence_transformers import CrossEncoder
|
| 4 |
from my_logging import log_message
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
api_key = os.getenv('GOOGLE_API_KEY') # or however you're loading it
|
| 8 |
+
if not api_key:
|
| 9 |
+
raise ValueError("GOOGLE_API_KEY not found in environment")
|
| 10 |
+
|
| 11 |
|
| 12 |
def get_llm_model(api_key, model_name="gemini-2.0-flash"):
|
| 13 |
return GoogleGenAI(model=model_name, api_key=api_key)
|