File size: 2,757 Bytes
d0f1307
 
 
 
 
3cf4930
d0f1307
de1ac3b
 
 
 
 
 
d0f1307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import streamlit as st
from docx import Document
from transformers import pipeline
from langdetect import detect
import spacy
import os

# --- One-time model setup (runs at import) ---

# Ensure the spaCy English model is installed before loading it.
model_name = "en_core_web_sm"
if not spacy.util.is_package(model_name):
    # Use spaCy's own download API instead of os.system: no shell involved,
    # and it raises on failure rather than silently returning a bad exit code.
    spacy.cli.download(model_name)

nlp = spacy.load(model_name)

# Summarization pipeline.
# NOTE(review): Meta-Llama-3-8B is a decoder-only, gated checkpoint, while the
# "summarization" pipeline expects a seq2seq model — confirm this actually
# loads and summarizes as intended, or switch to a text-generation prompt.
llama_summarizer = pipeline("summarization", model="meta-llama/Meta-Llama-3-8B")

# Text-generation pipeline used for recommendations.
# NOTE(review): the UI labels this "Gemma 2-9B-IT" but the checkpoint below is
# the 2B instruction-tuned Gemma — confirm which model is intended.
gemma_recommender = pipeline("text-generation", model="google/gemma-2b-it")

# Function to extract text from a DOCX file
def extract_text_from_docx(docx_file):
    """Return the document body as one string, one non-blank paragraph per line."""
    document = Document(docx_file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(line for line in paragraph_texts if line.strip())

# Function to detect document language
def detect_language(text):
    """Return the language code that langdetect infers for *text*."""
    language_code = detect(text)
    return language_code

# Function to extract metadata
def extract_metadata(text):
    """Compute simple metadata for *text*.

    Returns a dict with:
      - "Word Count": number of whitespace-separated tokens.
      - "Entities": mapping of spaCy entity label -> list of entity texts.

    Fix: the previous dict comprehension ``{ent.label_: ent.text ...}`` kept
    only the LAST entity seen per label, silently dropping all earlier ones;
    entities are now grouped into lists (st.json renders either shape).
    """
    doc = nlp(text)
    word_count = len(text.split())
    entities = {}
    for ent in doc.ents:
        # setdefault groups every mention under its label instead of overwriting.
        entities.setdefault(ent.label_, []).append(ent.text)
    return {
        "Word Count": word_count,
        "Entities": entities,
    }

# Function to generate abstract (summary) using Llama 3
def generate_summary(text):
    """Produce a 50-200 token greedy (no sampling) summary of *text*."""
    results = llama_summarizer(
        text,
        max_length=200,
        min_length=50,
        do_sample=False,
    )
    first_result = results[0]
    return first_result['summary_text']

# Function to generate recommendations using Gemma 2-9B-IT
def generate_recommendations(text):
    """Prompt the Gemma pipeline for recommendations grounded in *text*."""
    prompt = f"Provide three key recommendations based on the following document:\n{text}\n\nRecommendations:"
    outputs = gemma_recommender(
        prompt,
        max_length=300,
        num_return_sequences=1,
        do_sample=False,
    )
    return outputs[0]['generated_text']

# Streamlit UI
st.title("πŸ“„ AI-Powered Multi-Language Document Analyzer")

uploaded_file = st.file_uploader("Upload a Word Document", type=["docx"])

if uploaded_file:
    st.success("File uploaded successfully!")

    # Pull the raw text out of the upload once, then derive everything else
    # (language, metadata) from that single string.
    document_text = extract_text_from_docx(uploaded_file)
    detected_language = detect_language(document_text)
    document_metadata = extract_metadata(document_text)

    st.subheader("Extracted Text:")
    st.text_area("Document Content", document_text, height=250)

    st.subheader("πŸ—£οΈ Detected Language:")
    st.write(detected_language)

    st.subheader("πŸ“Š Metadata:")
    st.json(document_metadata)

    if st.button("Generate Abstract & Recommendations"):
        # Both model calls run inside the spinner so the user gets progress
        # feedback during the slow LLM inference.
        with st.spinner("Analyzing..."):
            abstract = generate_summary(document_text)
            advice = generate_recommendations(document_text)

        st.subheader("πŸ“Œ Abstract (Summary) - Llama 3")
        st.write(abstract)

        st.subheader("βœ… Recommendations - Gemma 2-9B-IT")
        st.write(advice)