Browse Source

Initial commit

main
Harish Karumuthil 3 years ago
commit
8a5d9cab5d
  1. 2
      .gitignore
  2. 54
      Readme.md
  3. 78
      language_detector.py
  4. 8
      requirements.txt
  5. 3
      run-server.sh
  6. 71
      server.py

2
.gitignore

@ -0,0 +1,2 @@
lang-detect-MNB.pkl
dataset.csv

54
Readme.md

@ -0,0 +1,54 @@
## Prerequisites
* python3
## Installation
To avoid polluting your local packages, it is best to create a virtual environment for this installation
```bash
pip3 install virtualenv
# Create a virtual env in ./venv directory
virtualenv ./venv
# Activate virtualenv
. ./venv/bin/activate
```
Then install dependencies
```bash
pip3 install -r ./requirements.txt
```
## Running server
```bash
./run-server.sh
# INFO:language-detector:Training model ...
# INFO:language-detector:Downloading dataset from url ...
# INFO:language-detector:Saved dataset to cache ...
# INFO:language-detector:Training complete. Feature count: 6109840
# INFO:language-detector:saved model lang-detect-MNB.pkl
# INFO: Started server process [1780455]
# INFO:uvicorn.error:Started server process [1780455]
# INFO: Waiting for application startup.
# INFO:uvicorn.error:Waiting for application startup.
# INFO: Application startup complete.
# INFO:uvicorn.error:Application startup complete.
# INFO: Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)
# INFO:uvicorn.error:Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)
```
## checking API response from terminal
```bash
curl -X POST -H "Accept: application/json" -H "Content-Type: application/json" "http://localhost:8000/api/language-detect" -d '{"text": "Hello how old are you"}'
# {"lang": "English", "score": 0.9998260997236165}
```

78
language_detector.py

@ -0,0 +1,78 @@
#!/usr/bin/env python
import os
import pickle
import iso639
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from logging import getLogger
log = getLogger("language-detector")

# On-disk cache locations for the trained model and the raw dataset.
modelFilename = "lang-detect-MNB.pkl"
datasetFilename = "./dataset.csv"


def saveModel(model, cv):
    """Persist the classifier and its vectorizer together as one pickle."""
    # Context manager closes the handle even if dump raises
    # (the original left the file object open).
    with open(modelFilename, "wb") as f:
        pickle.dump([model, cv], f)
    log.info("saved model %s" % modelFilename)


def loadModel():
    """Load the (model, vectorizer) pair previously written by saveModel.

    NOTE(review): pickle.load executes code from the file; acceptable here
    only because the pickle is produced locally by saveModel.
    """
    with open(modelFilename, "rb") as f:
        loaded_model, loaded_cv = pickle.load(f)
    log.info("loaded model %s" % modelFilename)
    return loaded_model, loaded_cv


def getModel():
    """Return a ready (model, vectorizer) pair, training only on cache miss."""
    if os.path.exists(modelFilename):
        return loadModel()
    model, cv = trainModel()
    saveModel(model, cv)
    return model, cv
def getDataType1():
    """Load the language dataset, preferring the local CSV cache.

    Returns a pandas DataFrame containing (at least) the columns
    "Text" and "language" used downstream by trainModel.
    """
    if os.path.exists(datasetFilename):
        log.info("loading cached dataset %s" % datasetFilename)
        data = pd.read_csv(datasetFilename)
    else:
        log.info("Downloading dataset from url ...")
        data = pd.read_csv(
            "https://raw.githubusercontent.com/amankharwal/Website-data/master/dataset.csv"
        )
        # index=False keeps the cached CSV identical to the download; the
        # original wrote the RangeIndex too, so every cache reload gained a
        # spurious "Unnamed: 0" column.
        data.to_csv(datasetFilename, index=False)
        log.info("Saved dataset to cache ...")
    # Dropped the original's bare `data.isnull().sum()` and
    # `data["language"].value_counts()` lines: their results were discarded.
    return data
def trainModel():
    """Fit a Multinomial Naive Bayes language classifier from scratch.

    Returns the fitted classifier together with the CountVectorizer
    that defines its feature space.
    """
    log.info("Training model ...")
    vectorizer = CountVectorizer()
    classifier = MultinomialNB()
    data = getDataType1()
    # Bag-of-words features from the raw text column; labels are the
    # human-readable language names.
    features = vectorizer.fit_transform(np.array(data["Text"]))
    labels = np.array(data["language"])
    classifier.fit(features, labels)
    log.info("Training complete. Feature count: %d" % classifier.feature_count_.size)
    return classifier, vectorizer
# Train (or load from cache) the shared classifier once at import time so
# every predict() call reuses it.
model, cv = getModel()


def predict(text):
    """Return the top-3 candidate languages for *text*.

    The result is a dict of two parallel lists produced by
    DataFrame.to_dict("list"): key 0 holds probability scores in
    descending order, key 1 the matching language names.
    """
    features = cv.transform([text]).toarray()
    probabilities = model.predict_proba(features).reshape(-1)
    ranked = pd.DataFrame(zip(probabilities, model.classes_))
    top3 = ranked.sort_values(0, ascending=False)[:3]
    return top3.to_dict("list")

8
requirements.txt

@ -0,0 +1,8 @@
uvicorn==0.18.3
falcon==3.1.0
pydantic==1.10.2
spectree==0.10.6
iso-639==0.4.5
numpy==1.23.3
pandas==1.5.0
scikit-learn==1.1.2

3
run-server.sh

@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Launch the ASGI app, forwarding any extra flags to uvicorn.
# "$@" (quoted) preserves arguments containing spaces; the original's
# bare $@ re-split them at whitespace.
uvicorn server:app "$@"

71
server.py

@ -0,0 +1,71 @@
#!/usr/bin/env python
import logging

# Configure logging BEFORE importing language_detector: that import trains
# (or loads) the model and its progress is logged, so the handler must
# already be installed.
logging.basicConfig(level=logging.DEBUG)
import falcon
import falcon.asgi
from pydantic import BaseModel, Field
from spectree import Response, SpecTree, Tag
from language_detector import predict

log = logging.getLogger("server")

# OpenAPI spec generator + request/response validator for the Falcon ASGI app.
api = SpecTree(
    "falcon-asgi",
    title="Language detection service",
    version="0.0.1",
    description="Detect language of given text",
    contact={
        "name": "Harish",
        "email": "harish2704@gmail.com",
        "url": "https://github.com/harish2704",
    },
)
class LanguageDetectionRequest(BaseModel):
    """Request body: the raw text whose language should be detected."""

    text: str

    class Config:
        # Example rendered in the generated OpenAPI docs (pydantic v1 style).
        schema_extra = {
            "example": {
                "text": "How are you?",
            }
        }
class LanguageDetectionResponse(BaseModel):
    """Response body: best-guess language plus its probability."""

    lang: str
    score: float = Field(gt=0, le=1, description="Probability score of the detection")

    class Config:
        # Example rendered in the generated OpenAPI docs (pydantic v1 style).
        schema_extra = {
            "example": {
                "lang": "English",
                "score": 0.993,
            }
        }
class LanguageDetection:
    """
    Language detection demo
    """

    @api.validate(
        json=LanguageDetectionRequest, resp=Response(HTTP_200=LanguageDetectionResponse)
    )
    async def on_post(self, req, resp):
        """
        Detect language of given text
        """
        # predict() returns {0: [scores desc], 1: [language names]};
        # report only the single best candidate.
        ranking = predict(req.context.json.text)
        best_lang = ranking[1][0]
        best_score = ranking[0][0]
        resp.media = {"lang": best_lang, "score": best_score}
app = falcon.asgi.App()
app.add_route("/api/language-detect", LanguageDetection())
# Register with the spec generator AFTER routes are added so the generated
# OpenAPI document includes them.
api.register(app)
Loading…
Cancel
Save