Sidoine1991 commited on
Commit ·
3e9d6c3
1
Parent(s): 2809368
Initial commit CoranIA: ajout app, index, README et index zip
Browse files- README.md +72 -12
- app.py +109 -0
- chroma_db_coran.zip +3 -0
- index_coran.py +61 -0
- requirements.txt +6 -0
README.md
CHANGED
|
@@ -1,14 +1,74 @@
|
|
| 1 |
---
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
|
|
|
|
| 1 |
---
|
| 2 |
+
# CoranIA 📖🤖
|
| 3 |
+
|
| 4 |
+
CoranIA est une application RAG (Retrieval-Augmented Generation) pour poser des questions sur le Coran en français.
|
| 5 |
+
Elle utilise :
|
| 6 |
+
- ChromaDB pour l’index documentaire.
|
| 7 |
+
- Google Gemini pour la génération de réponses naturelles.
|
| 8 |
+
- Streamlit pour l’interface.
|
| 9 |
+
|
| 10 |
+
## Fonctionnalités
|
| 11 |
+
|
| 12 |
+
- Recherche sémantique dans le Coran grâce à ChromaDB.
|
| 13 |
+
- Génération de réponses en langage naturel avec Gemini.
|
| 14 |
+
- Boutons de partage (WhatsApp, Facebook).
|
| 15 |
+
- Extraction automatique de l’index zip.
|
| 16 |
+
- Interface simple et responsive.
|
| 17 |
+
|
| 18 |
+
## Structure du projet
|
| 19 |
+
|
| 20 |
+
```
|
| 21 |
+
CoranIA/
|
| 22 |
+
├─ app.py # Application Streamlit
|
| 23 |
+
├─ index_coran.py # Script d’indexation du PDF du Coran
|
| 24 |
+
├─ coran_french.pdf # PDF source du Coran
|
| 25 |
+
├─ chroma_db_coran/ # Dossier d’index ChromaDB (après indexation)
|
| 26 |
+
├─ chroma_db_coran.zip # Archive de l’index (upload sur HF)
|
| 27 |
+
├─ .env # Variables d’environnement (API Keys)
|
| 28 |
+
├─ requirements.txt # Dépendances Python
|
| 29 |
+
└─ README.md # Documentation du projet
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
## Installation et exécution locales
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
git clone https://huggingface.co/spaces/Sidoineko/CoranIA
|
| 36 |
+
cd CoranIA
|
| 37 |
+
|
| 38 |
+
# Créer et activer un venv
|
| 39 |
+
python -m venv venv
|
| 40 |
+
venv\Scripts\activate # Windows
|
| 41 |
+
# source venv/bin/activate # Linux/Mac
|
| 42 |
+
|
| 43 |
+
# Installer les dépendances
|
| 44 |
+
pip install -r requirements.txt
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
### Configurer la clé API
|
| 48 |
+
|
| 49 |
+
Créer un fichier `.env` à la racine :
|
| 50 |
+
```ini
|
| 51 |
+
GOOGLE_API_KEY=...
|
| 52 |
+
GEMINI_API_KEY_2=...
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
### Générer l’index (si non déjà présent)
|
| 56 |
+
```bash
|
| 57 |
+
python index_coran.py
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### Lancer l’application
|
| 61 |
+
```bash
|
| 62 |
+
streamlit run app.py
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
## Déploiement sur Hugging Face Spaces
|
| 66 |
+
|
| 67 |
+
1. Zipper le dossier d’index : `chroma_db_coran/` → `chroma_db_coran.zip`
|
| 68 |
+
2. Uploader `chroma_db_coran.zip` dans le repo du Space (Dataset).
|
| 69 |
+
3. Pusher les sources (`app.py`, `index_coran.py`, `requirements.txt`, `README.md`).
|
| 70 |
+
4. Le Space se déploie automatiquement.
|
| 71 |
+
|
| 72 |
+
## License
|
| 73 |
|
| 74 |
+
MIT License
|
app.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import zipfile
|
| 3 |
+
import streamlit as st
|
| 4 |
+
import chromadb
|
| 5 |
+
from chromadb.utils import embedding_functions
|
| 6 |
+
import google.generativeai as genai
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
import urllib.parse
|
| 9 |
+
|
| 10 |
+
# Load environment variables (API keys) from a local .env file.
load_dotenv()

# --- Configurations ---
PERSIST_DIRECTORY = "chroma_db_coran"        # extracted ChromaDB index directory
INDEX_ZIP_PATH = "chroma_db_coran.zip"       # zipped index shipped with the Space
DB_COLLECTION_NAME = "coran_rag_collection"
EMBEDDING_MODEL_NAME = "models/text-embedding-004"
LLM_MODEL_NAME = "models/gemini-1.5-flash-latest"

# --- UI Header ---
# The logo is optional; a missing image file must not crash the app.
try:
    st.image("logo_coran.jpg", width=120)
except Exception:  # BUG FIX: was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt
    pass
st.markdown("<h1 style='text-align:center; color:#2d8659;'>❓ CoranIA</h1>", unsafe_allow_html=True)
st.markdown("<p style='text-align:center; color:#444;'>📖 Posez une question sur le Coran</p>", unsafe_allow_html=True)
st.markdown("<hr>", unsafe_allow_html=True)

# --- Sidebar Configuration ---
with st.sidebar:
    st.header("⚙️ Configuration")
    n_results = st.slider("Nombre de passages à utiliser", 1, 5, 3)

# --- Ensure Index Exists ---
# On a fresh deployment (e.g. Hugging Face Spaces) the index only exists as a zip;
# extract it once into PERSIST_DIRECTORY.
if not os.path.exists(PERSIST_DIRECTORY):
    st.warning(f"🗂️ Index local non trouvé. Extraction de {INDEX_ZIP_PATH}...")
    try:
        with zipfile.ZipFile(INDEX_ZIP_PATH, "r") as z:
            z.extractall(PERSIST_DIRECTORY)
        st.success("✅ Index extrait avec succès.")
    except Exception as e:
        st.error(f"❌ Erreur lors de l'extraction de l'index : {e}")
        st.stop()

# --- API Key Setup ---
# Accept either env var name so local .env and Space secrets both work.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY_2")
if not GOOGLE_API_KEY:
    st.error("Clé API non trouvée. Définir GOOGLE_API_KEY ou GEMINI_API_KEY_2 dans .env ou secrets.")
    st.stop()

genai.configure(api_key=GOOGLE_API_KEY)

# --- ChromaDB & Embedding ---
# NOTE: the embedding model here must match the one used by index_coran.py,
# otherwise query embeddings will not align with the stored index.
embed_fn = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
    api_key=GOOGLE_API_KEY, model_name=EMBEDDING_MODEL_NAME
)
client = chromadb.PersistentClient(path=PERSIST_DIRECTORY)
try:
    collection = client.get_collection(name=DB_COLLECTION_NAME, embedding_function=embed_fn)
except Exception:  # BUG FIX: narrowed from bare `except:`; the collection may simply not exist yet
    collection = client.create_collection(name=DB_COLLECTION_NAME, embedding_function=embed_fn)
|
| 62 |
+
|
| 63 |
+
# --- Generative Model ---
@st.cache_resource
def get_generative_model(model_name):
    """Build (and cache via Streamlit) the Gemini generative model.

    Returns the model instance, or None when initialisation fails —
    callers must handle the None case.
    """
    try:
        model = genai.GenerativeModel(model_name)
    except Exception as exc:
        st.error(f"Erreur initialisation modèle génératif: {exc}")
        return None
    return model

gen_model = get_generative_model(LLM_MODEL_NAME)
|
| 73 |
+
|
| 74 |
+
# --- Ask Function ---
def ask_coran(question: str):
    """Retrieve relevant Quran passages and generate a natural-language answer.

    Args:
        question: The user's question (French).

    Returns:
        Tuple ``(answer_text, context)``. ``(None, None)`` when the question
        is empty; ``(None, context)`` when the generative model is unavailable.
    """
    if not question:
        return None, None
    # Embed the query with the same embedding function used at index time.
    q_emb = embed_fn([question])[0]
    results = collection.query(query_embeddings=[q_emb], n_results=n_results)
    docs = results.get("documents", [[]])[0]
    context = "\n\n".join(docs)
    prompt = f"Voici des extraits du Coran :\n{context}\n\nQuestion : {question}\nRéponse :"
    # ROBUSTNESS FIX: get_generative_model() returns None on init failure;
    # the original called gen_model.generate_content unconditionally, which
    # would raise AttributeError instead of degrading gracefully.
    if gen_model is None:
        return None, context
    response = gen_model.generate_content(prompt)
    return response.text, context
|
| 85 |
+
|
| 86 |
+
# --- Main Interaction ---
question = st.text_input("Votre question sur le Coran", "")
if st.button("Envoyer"):
    if not question:
        st.warning("Entrez une question.")
    else:
        with st.spinner("Recherche et génération..."):
            answer, context = ask_coran(question)
        if answer:
            st.success(answer)
            with st.expander("Passages utilisés"):
                st.write(context)
            # --- Share Buttons ---
            share_text = f"Réponse CoranIA : {answer}"
            encoded = urllib.parse.quote_plus(share_text)
            app_url = "https://huggingface.co/spaces/Sidoineko/CoranIA"
            whatsapp_url = f"https://wa.me/?text={encoded}%20{app_url}"
            # BUG FIX: the original line contained a stray `"e={encoded}"`
            # fragment after the closing quote, making the f-string a syntax
            # error; the sharer endpoint takes the shared text via `quote`.
            facebook_url = f"https://www.facebook.com/sharer/sharer.php?u={app_url}&quote={encoded}"
            st.markdown(f"""
            <div style='text-align:center; margin-top:1em;'>
                <a href="{whatsapp_url}" target="_blank"><img src="https://img.icons8.com/color/48/000000/whatsapp--v1.png" width="32"/></a>
                <a href="{facebook_url}" target="_blank"><img src="https://img.icons8.com/color/48/000000/facebook-new.png" width="32"/></a>
            </div>
            """, unsafe_allow_html=True)
|
chroma_db_coran.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dc0b63553f9e7a5da3be19275b46318f1179d74e96afebc7f6a44910cc0abdc2
|
| 3 |
+
size 16543212
|
index_coran.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
import chromadb
|
| 4 |
+
import PyPDF2
|
| 5 |
+
from chromadb.utils import embedding_functions
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
# --- Parameters ---
PDF_PATH = "coran_french.pdf"            # source PDF (French translation of the Quran)
CHROMA_DIR = "chroma_db_coran"           # ChromaDB persistence directory
COLLECTION_NAME = "coran_rag_collection"
EMBEDDING_MODEL = "models/text-embedding-004"
CHUNK_SIZE = 500 # characters per chunk
|
| 15 |
+
|
| 16 |
+
def extract_chunks(pdf_path, chunk_size=500):
    """Read a PDF and split its concatenated text into fixed-size chunks.

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        List of non-blank text chunks (whitespace-only slices are dropped).
    """
    print(f"Lecture du PDF : {pdf_path}")
    with open(pdf_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        # Pages with no extractable text yield None -> substitute ''.
        full_text = "\n".join(page.extract_text() or '' for page in reader.pages)
    chunks = [
        full_text[start:start + chunk_size]
        for start in range(0, len(full_text), chunk_size)
        if full_text[start:start + chunk_size].strip()
    ]
    print(f"Nombre de chunks générés : {len(chunks)}")
    return chunks
|
| 28 |
+
|
| 29 |
+
if __name__ == "__main__":
    if not os.path.exists(PDF_PATH):
        print(f"ERREUR : Le fichier {PDF_PATH} est introuvable dans le dossier courant.")
        exit(1)

    print("--- Démarrage de l'indexation du Coran ---")
    t0 = time.time()
    chunks = extract_chunks(PDF_PATH, CHUNK_SIZE)

    # Recreate the collection from scratch so re-runs do not duplicate chunks.
    client = chromadb.PersistentClient(path=CHROMA_DIR)
    if COLLECTION_NAME in [c.name for c in client.list_collections()]:
        print(f"Suppression de l'ancienne collection '{COLLECTION_NAME}'...")
        client.delete_collection(COLLECTION_NAME)
    collection = client.create_collection(COLLECTION_NAME)

    # SECURITY/BUG FIX: the original line was
    #   os.getenv("GEMINI_API_KEY_2=AIzaSy...")
    # which (a) leaked a hard-coded API key into source control — that key
    # must be revoked — and (b) passed "NAME=value" as the variable *name*,
    # so the lookup always returned None. Read the key from the environment
    # only, using the same variable names as app.py and the error below.
    api_key = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY_2")
    if not api_key:
        raise ValueError("Aucune clé API Gemini/Google n'a été trouvée dans GOOGLE_API_KEY ou GEMINI_API_KEY_2.")
    embedding_fn = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
        api_key=api_key, model_name=EMBEDDING_MODEL
    )

    print("Indexation des chunks dans ChromaDB...")
    batch_size = 32
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]
        ids = [f"chunk_{i+j}" for j in range(len(batch))]
        try:
            # Embeddings are computed explicitly so failures are per-batch
            # and indexing continues best-effort on errors.
            collection.add(documents=batch, ids=ids, embeddings=embedding_fn(batch))
        except Exception as e:
            print(f"Erreur à l'indexation du batch {i//batch_size}: {e}")
        print(f"Progression : {min(i+batch_size, len(chunks))}/{len(chunks)} chunks indexés")
    print(f"Indexation terminée en {time.time()-t0:.1f} secondes.")
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
chromadb==0.4.24
|
| 2 |
+
PyPDF2==3.0.1
|
| 3 |
+
streamlit==1.32.2
|
| 4 |
+
google-generativeai
|
| 5 |
+
requests
|
| 6 |
+
python-dotenv
|