Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,6 +15,7 @@ from flask import jsonify
|
|
| 15 |
load_dotenv()
|
| 16 |
|
| 17 |
API_URL_EMBEDDINGS = f"https://e4e5-196-96-202-255.ngrok-free.app/embeddings"
|
|
|
|
| 18 |
|
| 19 |
# FAISS index setup
|
| 20 |
DIM = 768 # Adjust based on the embedding model
|
|
@@ -72,6 +73,10 @@ def store_document_data(PDF_FILE):
|
|
| 72 |
def retrieve_document(query):
|
| 73 |
print(f"Retrieving document based on:\n{query}")
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
# Generate query embedding
|
| 76 |
query_embedding = embedding_model.encode([query]).astype(np.float32)
|
| 77 |
|
|
@@ -95,7 +100,7 @@ def clean_text(text):
|
|
| 95 |
print("cleaning")
|
| 96 |
text = unicodedata.normalize("NFKC", text) # Normalize Unicode characters
|
| 97 |
text = re.sub(r'\s+', ' ', text).strip() # Remove extra spaces and newlines
|
| 98 |
-
text = re.sub(r'[^a-zA-Z0-9.,!?;:\
|
| 99 |
text = re.sub(r'(?i)(page\s*\d+)', '', text) # Remove page numbers
|
| 100 |
return text
|
| 101 |
|
|
|
|
| 15 |
load_dotenv()
|
| 16 |
|
| 17 |
API_URL_EMBEDDINGS = f"https://e4e5-196-96-202-255.ngrok-free.app/embeddings"
|
| 18 |
+
API_URL_METADATA = f"https://e4e5-196-96-202-255.ngrok-free.app/metadata"
|
| 19 |
|
| 20 |
# FAISS index setup
|
| 21 |
DIM = 768 # Adjust based on the embedding model
|
|
|
|
| 73 |
def retrieve_document(query):
|
| 74 |
print(f"Retrieving document based on:\n{query}")
|
| 75 |
|
| 76 |
+
embeddings_file = response.get(API_URL_EMBEDDINGS)
|
| 77 |
+
metadata_file = response.get(API_URL_METADATA)
|
| 78 |
+
|
| 79 |
+
print(embeddings_file, metadata_file)
|
| 80 |
# Generate query embedding
|
| 81 |
query_embedding = embedding_model.encode([query]).astype(np.float32)
|
| 82 |
|
|
|
|
| 100 |
print("cleaning")
|
| 101 |
text = unicodedata.normalize("NFKC", text) # Normalize Unicode characters
|
| 102 |
text = re.sub(r'\s+', ' ', text).strip() # Remove extra spaces and newlines
|
| 103 |
+
text = re.sub(r'[^a-zA-Z0-9.,!?;:\\"()\-]', ' ', text) # Keep basic punctuation
|
| 104 |
text = re.sub(r'(?i)(page\s*\d+)', '', text) # Remove page numbers
|
| 105 |
return text
|
| 106 |
|