Create utils.py
utils.py
ADDED
# utils.py

import os
import fitz  # PyMuPDF
import faiss
import numpy as np
from openai import OpenAI

# API configuration
openai_api_key = os.getenv("OPENAI_API_KEY")

# Embedder
def get_embedding(text, model="text-embedding-ada-002"):
    client = OpenAI(api_key=openai_api_key)
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

# Extract text from each PDF page and embed it
def extract_text_and_vectors(files):
    documents = []
    for file in files:
        doc = fitz.open(file.name)
        for page_num, page in enumerate(doc):
            text = page.get_text()
            if text.strip():  # skip blank pages
                vector = get_embedding(text)
                documents.append({
                    "file_name": file.name,
                    "page_num": page_num + 1,
                    "text": text,
                    "vector": np.array(vector).astype("float32"),
                })
    return documents

# Build the FAISS index (1536 = ada-002 embedding size)
def build_faiss_index(documents, dim=1536):
    index = faiss.IndexFlatL2(dim)
    vectors = [doc["vector"] for doc in documents]
    index.add(np.array(vectors))
    return index

# Search FAISS for the k pages closest to the query
def search_similar_content(query, documents, index, k=3):
    query_vector = np.array(get_embedding(query)).astype("float32").reshape(1, -1)
    D, I = index.search(query_vector, k)  # distances, indices
    results = [documents[i] for i in I[0]]
    return results

# Format the retrieved pages for display
def format_response(results):
    formatted = []
    for r in results:
        snippet = r["text"][:500].strip().replace('\n', ' ')
        formatted.append(f"📄 **{r['file_name']}** | page {r['page_num']}\n{text_shorten(snippet)}\n")
    return "\n---\n".join(formatted)

# Helper for shortening text
def text_shorten(text, max_chars=300):
    return text if len(text) <= max_chars else text[:max_chars] + "..."

# Log a sample of the processed documents
def log_debug_info(documents, max_samples=2):
    info = f"📦 Total documents processed: {len(documents)}\n\n"
    for doc in documents[:max_samples]:
        info += f"📝 File: {doc['file_name']} | page: {doc['page_num']}\n"
        info += f"Sample text: {text_shorten(doc['text'])}\n\n"
    return info
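For context, a minimal sketch of how these utilities chain together end to end. This is hypothetical glue code, not part of the commit: `answer_query` is an invented name, and `files` is assumed to be a list of objects with a `.name` path attribute (e.g. Gradio File uploads), matching what `extract_text_and_vectors` expects.

# Hypothetical driver (illustration only, not part of utils.py).
from utils import (
    extract_text_and_vectors,
    build_faiss_index,
    search_similar_content,
    format_response,
    log_debug_info,
)

def answer_query(files, query):
    # Embed every non-empty PDF page, then index the vectors.
    documents = extract_text_and_vectors(files)
    index = build_faiss_index(documents)
    print(log_debug_info(documents))  # peek at a sample of what was indexed
    # Retrieve the 3 most similar pages and render them for display.
    results = search_similar_content(query, documents, index, k=3)
    return format_response(results)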