|
|
import os
import subprocess
import sys

import spacy
import streamlit as st
from docx import Document
from langdetect import detect
from transformers import pipeline
|
|
|
|
|
|
|
|
# Name of the spaCy model package used for entity extraction.
model_name = "en_core_web_sm"

# Install the model on first run. The download is launched with the interpreter
# that is running this script (sys.executable) so the package lands in the same
# environment, and check=True surfaces a failed download immediately instead of
# letting spacy.load() fail later with a less obvious error.
if not spacy.util.is_package(model_name):
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", model_name],
        check=True,
    )

# Shared spaCy pipeline used by extract_metadata().
nlp = spacy.load(model_name)
|
|
|
|
|
|
|
|
@st.cache_resource
def _load_llama_summarizer():
    """Build the summarization pipeline once and reuse it across script reruns.

    Streamlit re-executes this script on every widget interaction; without the
    cache the model would be re-created on each rerun.
    """
    # NOTE(review): "meta-llama/Meta-Llama-3-8B" looks like a decoder-only base
    # checkpoint, while the "summarization" task usually expects a
    # sequence-to-sequence model -- confirm this pipeline actually loads.
    return pipeline("summarization", model="meta-llama/Meta-Llama-3-8B")


llama_summarizer = _load_llama_summarizer()
|
|
|
|
|
|
|
|
@st.cache_resource
def _load_gemma_recommender():
    """Build the text-generation pipeline once and reuse it across script reruns.

    Streamlit re-executes this script on every widget interaction; without the
    cache the model would be re-created on each rerun.
    """
    return pipeline("text-generation", model="google/gemma-2b-it")


gemma_recommender = _load_gemma_recommender()
|
|
|
|
|
|
|
|
def extract_text_from_docx(docx_file):
    """Return the text of a Word document, one non-blank paragraph per line.

    Args:
        docx_file: Anything ``docx.Document`` accepts; in this script it is the
            file object returned by the Streamlit uploader.

    Returns:
        str: The text of every paragraph that contains non-whitespace
        characters, joined with newlines.
    """
    kept_paragraphs = []
    for paragraph in Document(docx_file).paragraphs:
        content = paragraph.text
        # Paragraphs that are empty or whitespace-only are skipped.
        if content.strip():
            kept_paragraphs.append(content)
    return "\n".join(kept_paragraphs)
|
|
|
|
|
|
|
|
def detect_language(text):
    """Return the language code detected for *text*.

    Args:
        text: The document text to classify.

    Returns:
        str: The value returned by ``langdetect.detect``, or ``"unknown"`` when
        *text* is empty or contains only whitespace.
    """
    # langdetect has nothing to work with on blank input and raises; an empty
    # document should not take the whole page down, so report it explicitly.
    if not text or not text.strip():
        return "unknown"
    return detect(text)
|
|
|
|
|
|
|
|
def extract_metadata(text):
    """Compute simple metadata for *text* using the module-level spaCy pipeline.

    Args:
        text: The document text to analyse.

    Returns:
        dict: ``"Word Count"`` is the number of whitespace-separated tokens in
        *text*; ``"Entities"`` maps each entity label to the list of distinct
        entity texts carrying that label, in order of first appearance.
    """
    doc = nlp(text)

    # Group by label instead of building {label: text} directly: a plain dict
    # comprehension keeps only the last entity seen for each label.
    entities = {}
    for ent in doc.ents:
        mentions = entities.setdefault(ent.label_, [])
        if ent.text not in mentions:
            mentions.append(ent.text)

    return {
        "Word Count": len(text.split()),
        "Entities": entities,
    }
|
|
|
|
|
|
|
|
def generate_summary(text):
    """Summarize *text* with the module-level ``llama_summarizer`` pipeline.

    The pipeline is called with sampling disabled, ``min_length=50`` and
    ``max_length=200``.

    Args:
        text: The document text to summarize.

    Returns:
        The ``"summary_text"`` field of the first result the pipeline returns.
    """
    generation_options = {"max_length": 200, "min_length": 50, "do_sample": False}
    results = llama_summarizer(text, **generation_options)
    first_result = results[0]
    return first_result["summary_text"]
|
|
|
|
|
|
|
|
def generate_recommendations(text):
    """Ask the module-level ``gemma_recommender`` pipeline for recommendations.

    Args:
        text: The document text the recommendations should be based on.

    Returns:
        str: The generated continuation only (the prompt is not echoed back),
        with surrounding whitespace stripped.
    """
    prompt = (
        "Provide three key recommendations based on the following document:\n"
        f"{text}\n\nRecommendations:"
    )
    outputs = gemma_recommender(
        prompt,
        # Budget the *generated* tokens. max_length would also count the
        # prompt, which already contains the whole document.
        max_new_tokens=300,
        num_return_sequences=1,
        do_sample=False,
        # Without this the pipeline returns prompt + completion, i.e. the
        # entire document would be shown again above the recommendations.
        return_full_text=False,
    )
    return outputs[0]["generated_text"].strip()
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Streamlit page: upload a .docx, show its text, language and metadata, then
# generate a summary and recommendations on demand.
# ---------------------------------------------------------------------------

# NOTE(review): the leading "π" in the title and in the "Metadata" / "Abstract"
# labels looks like a mis-decoded emoji. The original glyph cannot be recovered
# from this file, so those strings are left untouched -- restore as desired.
st.title("π AI-Powered Multi-Language Document Analyzer")

uploaded_file = st.file_uploader("Upload a Word Document", type=["docx"])

if uploaded_file:
    st.success("File uploaded successfully!")

    # Run the cheap analyses immediately after upload.
    doc_text = extract_text_from_docx(uploaded_file)
    language = detect_language(doc_text)
    metadata = extract_metadata(doc_text)

    st.subheader("Extracted Text:")
    st.text_area("Document Content", doc_text, height=250)

    st.subheader("🗣️ Detected Language:")
    st.write(language)

    st.subheader("π Metadata:")
    st.json(metadata)

    # The model calls are slow, so they only run when explicitly requested.
    if st.button("Generate Abstract & Recommendations"):
        with st.spinner("Analyzing..."):
            summary = generate_summary(doc_text)
            recommendations = generate_recommendations(doc_text)

        st.subheader("π Abstract (Summary) - Llama 3")
        st.write(summary)

        # Label matches the model that is actually loaded (google/gemma-2b-it).
        st.subheader("✅ Recommendations - Gemma 2B-IT")
        st.write(recommendations)
|
|
|