commit
8a5d9cab5d
6 changed files with 216 additions and 0 deletions
@ -0,0 +1,2 @@ |
|||||
|
lang-detect-MNB.pkl |
||||
|
dataset.csv |
@ -0,0 +1,54 @@ |
|||||
|
|
||||
|
|
||||
|
## Prerequisite |
||||
|
* python3 |
||||
|
|
||||
|
|
||||
|
## Installation |
||||
|
|
||||
|
To avoid polluting locally installed packages, it is better to create a virtual environment for this installation
||||
|
|
||||
|
```bash |
||||
|
pip3 install virtualenv |
||||
|
|
||||
|
# Create a virtual env in ./venv directory |
||||
|
virtualenv ./venv |
||||
|
|
||||
|
# Activate virtualenv |
||||
|
. ./venv/bin/activate |
||||
|
``` |
||||
|
|
||||
|
Then install dependencies |
||||
|
|
||||
|
```bash |
||||
|
pip3 install -r ./requirements.txt |
||||
|
``` |
||||
|
|
||||
|
## Running server |
||||
|
|
||||
|
```bash |
||||
|
./run-server.sh |
||||
|
|
||||
|
# INFO:language-detector:Training model ... |
||||
|
# INFO:language-detector:Downloading dataset from url ... |
||||
|
# INFO:language-detector:Saved dataset to cache ... |
||||
|
# INFO:language-detector:Training complete. Feature count: 6109840 |
||||
|
# INFO:language-detector:saved model lang-detect-MNB.pkl |
||||
|
# INFO: Started server process [1780455] |
||||
|
# INFO:uvicorn.error:Started server process [1780455] |
||||
|
# INFO: Waiting for application startup. |
||||
|
# INFO:uvicorn.error:Waiting for application startup. |
||||
|
# INFO: Application startup complete. |
||||
|
# INFO:uvicorn.error:Application startup complete. |
||||
|
# INFO: Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit) |
||||
|
# INFO:uvicorn.error:Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit) |
||||
|
|
||||
|
``` |
||||
|
|
||||
|
## Checking the API response from the terminal
||||
|
|
||||
|
```bash |
||||
|
curl -X POST -H "Accept: application/json" -H "Content-Type: application/json" "http://localhost:8000/api/language-detect" -d '{"text": "Hello how old are you"}' |
||||
|
|
||||
|
# {"lang": "English", "score": 0.9998260997236165} |
||||
|
``` |
@ -0,0 +1,78 @@ |
|||||
|
#!/usr/bin/env python |
||||
|
|
||||
|
import os |
||||
|
import pickle |
||||
|
import iso639 |
||||
|
import pandas as pd |
||||
|
import numpy as np |
||||
|
from sklearn.feature_extraction.text import CountVectorizer |
||||
|
from sklearn.model_selection import train_test_split |
||||
|
from sklearn.naive_bayes import MultinomialNB |
||||
|
from logging import getLogger |
||||
|
|
||||
|
|
||||
|
# Module-level logger shared by the training/loading helpers below.
log = getLogger("language-detector")

# Path where the trained classifier + fitted vectorizer are pickled.
modelFilename = "lang-detect-MNB.pkl"
# Local cache location for the downloaded training dataset.
datasetFilename = "./dataset.csv"
||||
|
|
||||
|
|
||||
|
def saveModel(model, cv):
    """Persist the trained classifier and its fitted CountVectorizer.

    Both objects are pickled together as a two-element list so that
    loadModel() restores them as a matching pair.
    """
    # Use a context manager so the file handle is closed even if
    # pickling raises (the original left the handle open).
    with open(modelFilename, "wb") as fh:
        pickle.dump([model, cv], fh)
    log.info("saved model %s", modelFilename)
||||
|
|
||||
|
|
||||
|
def loadModel():
    """Load the pickled [model, CountVectorizer] pair from disk.

    Returns:
        tuple: (model, cv) as written by saveModel().
    """
    # Context manager closes the file promptly (original leaked the handle).
    # NOTE(review): unpickling a local cache file is acceptable here, but
    # pickle must never be used on untrusted input.
    with open(modelFilename, "rb") as fh:
        loaded_model, loaded_cv = pickle.load(fh)
    log.info("loaded model %s", modelFilename)
    return loaded_model, loaded_cv
||||
|
|
||||
|
|
||||
|
def getModel():
    """Return a (model, cv) pair, training and caching one if absent."""
    if not os.path.exists(modelFilename):
        # No pickled model yet: train from scratch and persist the result
        # so subsequent startups skip training.
        trained, vectorizer = trainModel()
        saveModel(trained, vectorizer)
        return trained, vectorizer
    return loadModel()
||||
|
|
||||
|
|
||||
|
def getDataType1():
    """Return the language dataset as a DataFrame, caching it locally.

    On first call the CSV is downloaded and written to datasetFilename;
    later calls read the cached copy.

    Returns:
        pandas.DataFrame: columns include "Text" and "language".
    """
    if os.path.exists(datasetFilename):
        log.info("loading cached dataset %s", datasetFilename)
        data = pd.read_csv(datasetFilename)
    else:
        log.info("Downloading dataset from url ...")
        data = pd.read_csv(
            "https://raw.githubusercontent.com/amankharwal/Website-data/master/dataset.csv"
        )
        # index=False: without it the row index is written as an extra
        # column, so re-reading the cache would yield an "Unnamed: 0"
        # column and differ from the freshly downloaded frame.
        data.to_csv(datasetFilename, index=False)
        log.info("Saved dataset to cache ...")
    # (Removed two no-op statements whose results were discarded:
    # data.isnull().sum() and data["language"].value_counts().)
    return data
||||
|
|
||||
|
|
||||
|
def trainModel():
    """Train a Multinomial Naive Bayes language classifier.

    Returns:
        tuple: (fitted MultinomialNB model, fitted CountVectorizer).
    """
    log.info("Training model ...")
    vectorizer = CountVectorizer()
    classifier = MultinomialNB()

    dataset = getDataType1()
    texts = np.array(dataset["Text"])
    labels = np.array(dataset["language"])

    # Bag-of-words features; the same fitted vectorizer must be reused
    # at prediction time.
    features = vectorizer.fit_transform(texts)
    classifier.fit(features, labels)
    log.info("Training complete. Feature count: %d" % classifier.feature_count_.size)
    return classifier, vectorizer
||||
|
|
||||
|
|
||||
|
# Import-time side effect: load (or train) the model once so every caller
# of predict() shares the same model/vectorizer pair.
model, cv = getModel()
||||
|
|
||||
|
|
||||
|
def predict(text):
    """Return the top-3 language predictions for *text*.

    Returns a dict of two parallel lists keyed by DataFrame column
    position: key 0 -> probability scores, key 1 -> language names,
    both ordered by descending score.
    """
    features = cv.transform([text]).toarray()
    probabilities = model.predict_proba(features).reshape(-1)
    ranked = pd.DataFrame(zip(probabilities, model.classes_))
    top3 = ranked.sort_values(0, ascending=False)[:3]
    return top3.to_dict("list")
@ -0,0 +1,8 @@ |
|||||
|
uvicorn==0.18.3 |
||||
|
falcon==3.1.0 |
||||
|
pydantic==1.10.2 |
||||
|
spectree==0.10.6 |
||||
|
iso-639==0.4.5 |
||||
|
numpy==1.23.3 |
||||
|
pandas==1.5.0 |
||||
|
scikit-learn==1.1.2
@ -0,0 +1,3 @@ |
|||||
|
#!/usr/bin/env bash

# Launch the ASGI server; any extra CLI arguments (e.g. --port 9000) are
# forwarded to uvicorn. "$@" is quoted so arguments containing spaces
# survive word splitting (the original unquoted $@ would break them).
uvicorn server:app "$@"
@ -0,0 +1,71 @@ |
|||||
|
#!/usr/bin/env python

import logging

# Configure logging BEFORE importing language_detector: that module trains
# or loads the model at import time and logs its progress, so the root
# logger must already be set up for those messages to appear.
logging.basicConfig(level=logging.DEBUG)

import falcon
import falcon.asgi
from pydantic import BaseModel, Field
from spectree import Response, SpecTree, Tag

from language_detector import predict

# Logger for request-handling code in this module.
log = logging.getLogger("server")
||||
|
|
||||
|
# Spectree instance: generates the OpenAPI spec and validates request /
# response payloads for the Falcon ASGI app.
api = SpecTree(
    "falcon-asgi",
    title="Language detection service",
    version="0.0.1",
    description="Detect language of given text",
    contact={
        "name": "Harish",
        "email": "harish2704@gmail.com",
        "url": "https://github.com/harish2704",
    },
)
||||
|
|
||||
|
|
||||
|
class LanguageDetectionRequest(BaseModel):
    """Request body for the language-detection endpoint."""

    # Free-form text whose language should be detected.
    text: str

    class Config:
        # Example payload shown in the generated OpenAPI docs.
        schema_extra = {"example": {"text": "How are you?"}}
||||
|
|
||||
|
|
||||
|
class LanguageDetectionResponse(BaseModel):
    """Response body: the best-scoring language for the submitted text."""

    # Human-readable language name, e.g. "English".
    lang: str
    # Detection probability, constrained to (0, 1].
    score: float = Field(gt=0, le=1, description="Probability score of the detection")

    class Config:
        # Example payload shown in the generated OpenAPI docs.
        schema_extra = {"example": {"lang": "English", "score": 0.993}}
||||
|
|
||||
|
|
||||
|
class LanguageDetection:
    """
    Language detection demo
    """

    @api.validate(
        json=LanguageDetectionRequest, resp=Response(HTTP_200=LanguageDetectionResponse)
    )
    async def on_post(self, req, resp):
        """
        Detect language of given text
        """
        # predict() returns a dict of two parallel lists sorted by score:
        # key 0 -> probabilities, key 1 -> language names.
        ranked = predict(req.context.json.text)
        best_language = ranked[1][0]
        best_score = ranked[0][0]
        resp.media = {"lang": best_language, "score": best_score}
||||
|
|
||||
|
|
||||
|
# ASGI application wiring: mount the detection endpoint and register the
# spectree-generated OpenAPI spec routes on the app.
app = falcon.asgi.App()
app.add_route("/api/language-detect", LanguageDetection())
api.register(app)
Loading…
Reference in new issue