commit
8a5d9cab5d
6 changed files with 216 additions and 0 deletions
@ -0,0 +1,2 @@ |
|||
lang-detect-MNB.pkl |
|||
dataset.csv |
@ -0,0 +1,54 @@ |
|||
|
|||
|
|||
## Prerequisite |
|||
* python3 |
|||
|
|||
|
|||
## Installation |
|||
|
|||
To avoid polluting locally installed packages, it is better to create a virtual environment for this installation
|||
|
|||
```bash |
|||
pip3 install virtualenv |
|||
|
|||
# Create a virtual env in ./venv directory |
|||
virtualenv ./venv |
|||
|
|||
# Activate virtualenv |
|||
. ./venv/bin/activate |
|||
``` |
|||
|
|||
Then install dependencies |
|||
|
|||
```bash |
|||
pip3 install -r ./requirements.txt |
|||
``` |
|||
|
|||
## Running server |
|||
|
|||
```bash |
|||
./run-server.sh |
|||
|
|||
# INFO:language-detector:Training model ... |
|||
# INFO:language-detector:Downloading dataset from url ... |
|||
# INFO:language-detector:Saved dataset to cache ... |
|||
# INFO:language-detector:Training complete. Feature count: 6109840 |
|||
# INFO:language-detector:saved model lang-detect-MNB.pkl |
|||
# INFO: Started server process [1780455] |
|||
# INFO:uvicorn.error:Started server process [1780455] |
|||
# INFO: Waiting for application startup. |
|||
# INFO:uvicorn.error:Waiting for application startup. |
|||
# INFO: Application startup complete. |
|||
# INFO:uvicorn.error:Application startup complete. |
|||
# INFO: Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit) |
|||
# INFO:uvicorn.error:Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit) |
|||
|
|||
``` |
|||
|
|||
## Checking the API response from a terminal
|||
|
|||
```bash |
|||
curl -X POST -H "Accept: application/json" -H "Content-Type: application/json" "http://localhost:8000/api/language-detect" -d '{"text": "Hello how old are you"}' |
|||
|
|||
# {"lang": "English", "score": 0.9998260997236165} |
|||
``` |
@ -0,0 +1,78 @@ |
|||
#!/usr/bin/env python |
|||
|
|||
import os |
|||
import pickle |
|||
import iso639 |
|||
import pandas as pd |
|||
import numpy as np |
|||
from sklearn.feature_extraction.text import CountVectorizer |
|||
from sklearn.model_selection import train_test_split |
|||
from sklearn.naive_bayes import MultinomialNB |
|||
from logging import getLogger |
|||
|
|||
|
|||
log = getLogger("language-detector")

# On-disk cache locations: the pickled (model, vectorizer) pair and the
# downloaded training dataset. Both paths are relative to the CWD.
modelFilename = "lang-detect-MNB.pkl"
datasetFilename = "./dataset.csv"
|||
|
|||
|
|||
def saveModel(model, cv):
    """Persist the trained classifier and its vectorizer to modelFilename.

    Args:
        model: fitted MultinomialNB classifier.
        cv: fitted CountVectorizer that produced the model's features.
    """
    # Context manager guarantees the handle is closed even if pickling
    # raises; the original passed open(...) inline and never closed it.
    with open(modelFilename, "wb") as fh:
        pickle.dump([model, cv], fh)
    log.info("saved model %s" % modelFilename)
|||
|
|||
|
|||
def loadModel():
    """Load the pickled (model, vectorizer) pair from modelFilename.

    Returns:
        tuple: (classifier, vectorizer) as written by saveModel().

    NOTE(review): unpickling is unsafe on untrusted files; acceptable here
    because we only ever read our own locally written cache.
    """
    # Context manager closes the file promptly; the original leaked the
    # handle returned by the inline open(...).
    with open(modelFilename, "rb") as fh:
        loaded_model, loaded_cv = pickle.load(fh)
    log.info("loaded model %s" % modelFilename)
    return loaded_model, loaded_cv
|||
|
|||
|
|||
def getModel():
    """Return (classifier, vectorizer), preferring the on-disk cache.

    Trains from scratch and writes the cache only when no pickled model
    exists yet.
    """
    if not os.path.exists(modelFilename):
        trained, vectorizer = trainModel()
        saveModel(trained, vectorizer)
        return trained, vectorizer
    return loadModel()
|||
|
|||
|
|||
def getDataType1():
    """Return the training dataset as a DataFrame, caching it locally.

    First call downloads the CSV and writes it to datasetFilename; later
    calls read the cached copy. Expected columns: "Text", "language".
    """
    if os.path.exists(datasetFilename):
        log.info("loading cached dataset %s" % datasetFilename)
        data = pd.read_csv(datasetFilename)
    else:
        log.info("Downloading dataset from url ...")
        data = pd.read_csv(
            "https://raw.githubusercontent.com/amankharwal/Website-data/master/dataset.csv"
        )
        # index=False keeps the cache identical to the source file; the
        # original also wrote the index, so re-reading the cache produced
        # a spurious "Unnamed: 0" column.
        data.to_csv(datasetFilename, index=False)
        log.info("Saved dataset to cache ...")
    # Removed two no-op statements whose results were silently discarded:
    # data.isnull().sum() and data["language"].value_counts().
    return data
|||
|
|||
|
|||
def trainModel():
    """Train a Multinomial Naive Bayes language classifier.

    Returns:
        tuple: (fitted MultinomialNB, fitted CountVectorizer).
    """
    log.info("Training model ...")

    dataset = getDataType1()
    texts = np.array(dataset["Text"])
    labels = np.array(dataset["language"])

    # Bag-of-words features feed the Naive Bayes classifier.
    vectorizer = CountVectorizer()
    features = vectorizer.fit_transform(texts)

    classifier = MultinomialNB()
    classifier.fit(features, labels)
    log.info("Training complete. Feature count: %d" % classifier.feature_count_.size)
    return classifier, vectorizer
|||
|
|||
|
|||
# Import-time side effect: the model is loaded (or trained, which includes a
# dataset download on first run) as soon as this module is imported, so the
# importing server blocks on startup until this completes.
model, cv = getModel()
|||
|
|||
|
|||
def predict(text):
    """Return the top-3 predictions for *text*.

    Returns:
        dict: {0: [scores desc], 1: [language names]} — column 0 holds the
        probabilities, column 1 the matching class labels.
    """
    features = cv.transform([text]).toarray()
    probabilities = model.predict_proba(features).reshape(-1)
    # Pair each probability with its class label, rank best-first.
    ranked = pd.DataFrame(zip(probabilities, model.classes_))
    ranked = ranked.sort_values(0, ascending=False)
    return ranked[:3].to_dict("list")
@ -0,0 +1,8 @@ |
|||
uvicorn==0.18.3 |
|||
falcon==3.1.0 |
|||
pydantic==1.10.2 |
|||
spectree==0.10.6 |
|||
iso-639==0.4.5 |
|||
numpy==1.23.3 |
|||
pandas==1.5.0 |
|||
scikit-learn==1.1.2
@ -0,0 +1,3 @@ |
|||
#!/usr/bin/env bash

# Launch the ASGI app defined in server.py, forwarding any extra CLI
# arguments (e.g. --host 0.0.0.0 --port 8000) to uvicorn.
# "$@" (quoted) preserves arguments containing spaces; the original's
# unquoted $@ re-split them.
uvicorn server:app "$@"
@ -0,0 +1,71 @@ |
|||
#!/usr/bin/env python

import logging

# Configure the root logger before the imports below so that module-level
# log output emitted during import (e.g. language_detector's training logs)
# is actually captured.
logging.basicConfig(level=logging.DEBUG)
|||
|
|||
import falcon |
|||
import falcon.asgi |
|||
from pydantic import BaseModel, Field |
|||
from spectree import Response, SpecTree, Tag |
|||
|
|||
from language_detector import predict |
|||
|
|||
log = logging.getLogger("server") |
|||
|
|||
# SpecTree provides request/response validation plus generated OpenAPI docs;
# the "falcon-asgi" backend matches the falcon.asgi.App created below.
api = SpecTree(
    "falcon-asgi",
    title="Language detection service",
    version="0.0.1",
    description="Detect language of given text",
    contact={
        "name": "Harish",
        "email": "harish2704@gmail.com",
        "url": "https://github.com/harish2704",
    },
)
|||
|
|||
|
|||
# Request body for POST /api/language-detect. Comments (not docstrings) are
# used here because pydantic would surface a class docstring as the schema
# description in the generated OpenAPI spec.
class LanguageDetectionRequest(BaseModel):
    # Text whose language should be detected.
    text: str

    class Config:
        # Example payload shown in the generated API docs.
        schema_extra = {
            "example": {
                "text": "How are you?",
            }
        }
|||
|
|||
|
|||
# Response body for POST /api/language-detect. Comments (not docstrings) are
# used here because pydantic would surface a class docstring as the schema
# description in the generated OpenAPI spec.
class LanguageDetectionResponse(BaseModel):
    # Human-readable language name, e.g. "English".
    lang: str
    # Constrained to (0, 1]: a returned top prediction always has score > 0.
    score: float = Field(gt=0, le=1, description="Probability score of the detection")

    class Config:
        # Example payload shown in the generated API docs.
        schema_extra = {
            "example": {
                "lang": "English",
                "score": 0.993,
            }
        }
|||
|
|||
|
|||
class LanguageDetection:
    """
    Language detection demo
    """

    # spectree validates the JSON body against LanguageDetectionRequest and
    # documents/validates the 200 response shape.
    @api.validate(
        json=LanguageDetectionRequest, resp=Response(HTTP_200=LanguageDetectionResponse)
    )
    async def on_post(self, req, resp):
        """
        Detect language of given text
        """
        # predict() returns {0: [scores desc], 1: [language names]};
        # index 0 of each list is the best match.
        prediction = predict(req.context.json.text)
        best_language = prediction[1][0]
        best_score = prediction[0][0]
        resp.media = {"lang": best_language, "score": best_score}
|||
|
|||
|
|||
# Wire up the ASGI application: a single POST route, then register the
# SpecTree validators and doc endpoints on the app.
app = falcon.asgi.App()
app.add_route("/api/language-detect", LanguageDetection())
api.register(app)
Loading…
Reference in new issue