From 8a5d9cab5d199c817b1997adeb619d2aae192b1f Mon Sep 17 00:00:00 2001
From: Harish Karumuthil <harish2704@gmail.com>
Date: Sat, 8 Oct 2022 03:07:00 +0530
Subject: [PATCH] Initial commit

---
 .gitignore           |  2 ++
 Readme.md            | 54 ++++++++++++++++++++++++++++++
 language_detector.py | 78 ++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt     |  8 +++++
 run-server.sh        |  3 ++
 server.py            | 71 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 216 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Readme.md
 create mode 100644 language_detector.py
 create mode 100644 requirements.txt
 create mode 100755 run-server.sh
 create mode 100644 server.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..acb4030
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+lang-detect-MNB.pkl
+dataset.csv
diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..960f15f
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,54 @@
+# Language detection service
+
+## Prerequisite
+* python3
+
+
+## Installation
+
+To avoid polluting your local packages, it is better to create a virtual environment for this installation:
+
+```bash
+pip3 install virtualenv
+
+# Create a virtual environment in the ./venv directory
+virtualenv ./venv
+
+# Activate the virtual environment
+. ./venv/bin/activate
+```
+
+Then install the dependencies:
+
+```bash
+pip3 install -r ./requirements.txt
+```
+
+## Running the server
+
+```bash
+./run-server.sh
+
+# INFO:language-detector:Training model ...
+# INFO:language-detector:Downloading dataset from url ...
+# INFO:language-detector:Saved dataset to cache ...
+# INFO:language-detector:Training complete. Feature count: 6109840
+# INFO:language-detector:saved model lang-detect-MNB.pkl
+# INFO:     Started server process [1780455]
+# INFO:uvicorn.error:Started server process [1780455]
+# INFO:     Waiting for application startup.
+# INFO:uvicorn.error:Waiting for application startup.
+# INFO:     Application startup complete.
+# INFO:uvicorn.error:Application startup complete.
+# INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)
+# INFO:uvicorn.error:Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)
+
+```
+
+## Checking the API response from the terminal
+
+```bash
+curl -X POST -H "Accept: application/json" -H "Content-Type: application/json" "http://localhost:8000/api/language-detect" -d '{"text": "Hello how old are you"}'
+
+# {"lang": "English", "score": 0.9998260997236165}
+```
diff --git a/language_detector.py b/language_detector.py
new file mode 100644
index 0000000..332ede7
--- /dev/null
+++ b/language_detector.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+
+import os
+import pickle
+import iso639
+import pandas as pd
+import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import MultinomialNB
+from logging import getLogger
+
+
+log = getLogger("language-detector")
+
+modelFilename = "lang-detect-MNB.pkl"
+datasetFilename = "./dataset.csv"
+
+
+def saveModel(model, cv):
+    pickle.dump([model, cv], open(modelFilename, "wb"))
+    log.info("saved model %s" % modelFilename)
+
+
+def loadModel():
+    loaded_model, loaded_cv = pickle.load(open(modelFilename, "rb"))
+    log.info("loaded model %s" % modelFilename)
+    return loaded_model, loaded_cv
+
+
+def getModel():
+    if os.path.exists(modelFilename):
+        return loadModel()
+    model, cv = trainModel()
+    saveModel(model, cv)
+    return model, cv
+
+
+def getDataType1():
+    if os.path.exists(datasetFilename):
+        log.info("loading cached dataset %s" % datasetFilename)
+        data = pd.read_csv(datasetFilename)
+    else:
+        log.info("Downloading dataset from url ...")
+        data = pd.read_csv(
+            "https://raw.githubusercontent.com/amankharwal/Website-data/master/dataset.csv"
+        )
+        data.to_csv(datasetFilename)
+        log.info("Saved dataset to cache ...")
+    data.isnull().sum()
+    data["language"].value_counts()
+    return data
+
+
+def trainModel():
+    log.info("Training model ...")
+    cv = CountVectorizer()
+    model = MultinomialNB()
+
+    data = getDataType1()
+    x = np.array(data["Text"])
+    y = np.array(data["language"])
+
+    X = cv.fit_transform(x)
+    model.fit(X, y)
+    log.info("Training complete. Feature count: %d" % model.feature_count_.size)
+    return model, cv
+
+
+model, cv = getModel()
+
+
+def predict(text):
+    data = cv.transform([text]).toarray()
+    pred = model.predict_proba(data).reshape(-1)
+    pred = pd.DataFrame(zip(pred, model.classes_))
+    pred = pred.sort_values(0, ascending=False)[:3].to_dict("list")
+    return pred
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a7e4ede
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+uvicorn==0.18.3
+falcon==3.1.0
+pydantic==1.10.2
+spectree==0.10.6
+iso-639==0.4.5
+numpy==1.23.3
+pandas==1.5.0
+scikit-learn==1.1.2
diff --git a/run-server.sh b/run-server.sh
new file mode 100755
index 0000000..8b45ded
--- /dev/null
+++ b/run-server.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+uvicorn server:app "$@"
diff --git a/server.py b/server.py
new file mode 100644
index 0000000..2ec809f
--- /dev/null
+++ b/server.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+
+import falcon
+import falcon.asgi
+from pydantic import BaseModel, Field
+from spectree import Response, SpecTree, Tag
+
+from language_detector import predict
+
+log = logging.getLogger("server")
+
+api = SpecTree(
+    "falcon-asgi",
+    title="Language detection service",
+    version="0.0.1",
+    description="Detect language of given text",
+    contact={
+        "name": "Harish",
+        "email": "harish2704@gmail.com",
+        "url": "https://github.com/harish2704",
+    },
+)
+
+
+class LanguageDetectionRequest(BaseModel):
+    text: str
+
+    class Config:
+        schema_extra = {
+            "example": {
+                "text": "How are you?",
+            }
+        }
+
+
+class LanguageDetectionResponse(BaseModel):
+    lang: str
+    score: float = Field(gt=0, le=1, description="Probability score of the detection")
+
+    class Config:
+        schema_extra = {
+            "example": {
+                "lang": "English",
+                "score": 0.993,
+            }
+        }
+
+
+class LanguageDetection:
+    """
+    Language detection demo
+    """
+
+    @api.validate(
+        json=LanguageDetectionRequest, resp=Response(HTTP_200=LanguageDetectionResponse)
+    )
+    async def on_post(self, req, resp):
+        """
+        Detect language of given text
+        """
+        pred = predict(req.context.json.text)
+        resp.media = {"lang": pred[1][0], "score": pred[0][0]}
+
+
+app = falcon.asgi.App()
+app.add_route("/api/language-detect", LanguageDetection())
+api.register(app)
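
As a quick check beyond the curl command in the Readme, the same endpoint can be exercised from Python using only the standard library. This is a minimal client sketch (not part of the patch itself), assuming the server started by run-server.sh is listening on its default address, http://localhost:8000:

```python
#!/usr/bin/env python
# Minimal client sketch: POST a sample sentence to the language-detect endpoint
# and print the JSON response, e.g. {"lang": "English", "score": 0.99...}.
# The URL and port are assumptions based on the defaults shown in the Readme.
import json
import urllib.request

payload = json.dumps({"text": "Hello how old are you"}).encode("utf-8")
request = urllib.request.Request(
    "http://localhost:8000/api/language-detect",
    data=payload,
    headers={"Content-Type": "application/json", "Accept": "application/json"},
    method="POST",
)
with urllib.request.urlopen(request) as response:
    print(json.load(response))
```

The fields in the printed result follow the LanguageDetectionResponse model in server.py: `lang` is the top predicted language and `score` its probability.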