From a22510ff6f73b5aaf61fe797199d7317ef396f26 Mon Sep 17 00:00:00 2001
From: Harish Karumuthil
Date: Sat, 8 Oct 2022 04:10:00 +0530
Subject: [PATCH] Added tests and documentation

---
 .gitignore       |  6 ++--
 Readme.design.md | 83 ++++++++++++++++++++++++++++++++++++++++++++
 Readme.md        |  2 +-
 server.py        | 25 ++++++++++++--
 test.py          | 90 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 201 insertions(+), 5 deletions(-)
 create mode 100644 Readme.design.md
 create mode 100644 test.py

diff --git a/.gitignore b/.gitignore
index acb4030..80b43cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
-lang-detect-MNB.pkl
-dataset.csv
+*.pkl
+*.csv
+__pycache__
+.bash_history

diff --git a/Readme.design.md b/Readme.design.md
new file mode 100644
index 0000000..458efa3
--- /dev/null
+++ b/Readme.design.md
@@ -0,0 +1,83 @@

# Design considerations

## Assumptions
1. The task is to build a general-purpose language detection system.
   * Since there was no specific requirement to optimize for accuracy or performance, a general-purpose system is assumed.
   * There is a trade-off between accuracy, performance, and resource consumption.
2. The system should be transparent, meaning it should be easy to trace how a particular result was produced.
3. The system should be easy to maintain and improve.

## Design choices
1. Since the requirement is not a highly accurate system, recurrent-neural-network-based solutions are ruled out in favor of lower resource consumption, transparency, and maintainability.
2. A Multinomial Naive Bayes text classifier is selected as a simple, transparent solution (a minimal sketch of the pipeline follows the architecture diagrams below).

## Constraints
1. Availability of datasets.
   * The number of languages supported and the quality of prediction depend heavily on the quality of the training dataset.
   * As there were no specific requirements regarding the list of languages to be supported and no dataset was provided, the supported languages are limited to those available in a good-quality public dataset.


# Architecture

### API server
```mermaid
graph TD
    A[Multi-threaded uvicorn server] -->|Request| B(Falcon app)
    B -->|Request| C[ Schema based input validation ]
    C -->|validated request| D[Controller]
    D -->|text| E[Language detector]
```

### Language detector

```mermaid
graph TD
    A[Controller] -->|Text| B[ Tokenizer/feature extraction 'CountVectorizer' ]
    B -->|Token array| C[Probabilistic predictor 'MultinomialNB' ]
    C -->|predictions| D(Return the name of the language with the highest prediction probability)
```
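The CountVectorizer/MultinomialNB combination named in the diagram can be wired together with scikit-learn roughly as below. This is a minimal illustrative sketch, not the project's actual training code; the CSV path and the "Text"/"language" column names are assumptions (borrowed from the column names used in test.py).

```python
# Illustrative sketch only: the CountVectorizer + MultinomialNB pipeline named
# in the diagram above. The CSV path and column names are assumptions.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

data = pd.read_csv("dataset.csv")  # assumed columns: "Text", "language"

model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(data["Text"], data["language"])

# Last step of the diagram: return the class with the highest predicted probability.
probs = model.predict_proba(["Hello how old are you"])[0]
best = probs.argmax()
print(model.classes_[best], probs[best])
```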
# Accuracy

Accuracy was tested using an entirely different dataset; the results are shown below.
```
          total  failed  success_percent
Chinese     500     485              3.0
Japanese    500     459              8.2
Hindi       500      19             96.2
Urdu        500      18             96.4
Turkish     500      13             97.4
Dutch       500      12             97.6
Thai        500       5             99.0
Arabic      500       4             99.2
Spanish     500       3             99.4
English     500       1             99.8
Russian     500       0            100.0
French      500       0            100.0
```

From these results it is clear that the model is not suitable for detecting languages such as Chinese and Japanese. Those languages have a very large number of characters, so the number of possible words is larger still. Unless the model is trained on a very large corpus (which would produce a large model with high memory usage), it cannot reliably detect the Chinese/Japanese family of languages.
For detecting these languages correctly there are the following options (a sketch of option 1 follows this list):
1. Check the Unicode characters in the text; if the majority are Chinese/Japanese, skip the model-based prediction and respond directly.
2. LSTM-based models can make more accurate predictions by analyzing character sequences; such a solution could be implemented.
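As a rough sketch of option 1, a short-circuit check on the share of CJK code points could run before the model. The Unicode ranges and the 50% threshold below are illustrative assumptions, not part of the current code.

```python
# Rough sketch of option 1: bypass the model when most characters fall in
# CJK/kana Unicode blocks. The ranges and the threshold are assumptions.
CJK_RANGES = [
    (0x4E00, 0x9FFF),  # CJK Unified Ideographs
    (0x3040, 0x309F),  # Hiragana
    (0x30A0, 0x30FF),  # Katakana
]


def looks_cjk(text: str, threshold: float = 0.5) -> bool:
    letters = [ch for ch in text if ch.isalpha()]
    if not letters:
        return False
    cjk = sum(
        1 for ch in letters if any(lo <= ord(ch) <= hi for lo, hi in CJK_RANGES)
    )
    return cjk / len(letters) >= threshold


print(looks_cjk("こんにちは、お元気ですか"))  # True
print(looks_cjk("Hello how old are you"))  # False
```

Note that this only covers the short-circuit part of option 1; telling Chinese and Japanese apart would still need an additional rule, for example the presence of kana.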
# Performance
The Apache Bench (ab) utility reports the following statistics on a machine with an "Intel(R) Core(TM) i5-3340M CPU @ 2.70GHz (4 threads)" and 8 GB RAM:
```
Concurrency  Req/sec  time/actual-req (ms)
10           96.12     104.032
50           98.33     508.481
100          98.43    1015.908
```

# API documentation

API documentation is auto-generated by the API server itself from the OpenAPI spec.
It can be accessed either at http://127.0.0.1:8000/apidoc/swagger (Swagger UI) or at http://127.0.0.1:8000/apidoc/redoc (Redocly UI).
HTML documentation generated from the OpenAPI spec is also included in the project directory for reference.

diff --git a/Readme.md b/Readme.md
index 960f15f..39d0900 100644
--- a/Readme.md
+++ b/Readme.md
@@ -48,7 +48,7 @@ pip3 install -r ./requirements.txt
 ## checking API response from terminal
 
 ```bash
-curl -X POST -H "Accept: application/json" -H "Content-Type: application/json" "http://localhost:8000/api/language-detect" -d '{"text": "Hello how old are you"}'
+curl -X POST -H "Accept: application/json" -H "Content-Type: application/json" "http://localhost:8000/api/language/predict" -d '{"text": "Hello how old are you"}'
 # {"lang": "English", "score": 0.9998260997236165}
 ```

diff --git a/server.py b/server.py
index 2ec809f..7989a55 100644
--- a/server.py
+++ b/server.py
@@ -7,9 +7,12 @@ logging.basicConfig(level=logging.DEBUG)
 import falcon
 import falcon.asgi
 from pydantic import BaseModel, Field
+from typing import List
 from spectree import Response, SpecTree, Tag
 
-from language_detector import predict
+from language_detector import predict, model
+
+SUPPORTED_LANGS = model.classes_.tolist()
 
 log = logging.getLogger("server")
 
@@ -37,6 +40,17 @@ class LanguageDetectionRequest(BaseModel):
         }
 
 
+class AvailableLanguagesResponse(BaseModel):
+    supported_languages: List[str]
+
+    class Config:
+        schema_extra = {
+            "example": {
+                "supported_languages": ["English", "Hindi"],
+            }
+        }
+
+
 class LanguageDetectionResponse(BaseModel):
     lang: str
     score: float = Field(gt=0, le=1, description="Probability score of the detection")
@@ -65,7 +79,14 @@ class LanguageDetection:
         pred = predict(req.context.json.text)
         resp.media = {"lang": pred[1][0], "score": pred[0][0]}
 
+    @api.validate(resp=Response(HTTP_200=AvailableLanguagesResponse))
+    async def on_get(self, req, resp):
+        """
+        Get a list of supported languages
+        """
+        resp.media = {"supported_languages": SUPPORTED_LANGS}
+
 
 app = falcon.asgi.App()
-app.add_route("/api/language-detect", LanguageDetection())
+app.add_route("/api/language/predict", LanguageDetection())
 api.register(app)

diff --git a/test.py b/test.py
new file mode 100644
index 0000000..759c951
--- /dev/null
+++ b/test.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+import os
+import requests
+import pandas as pd
+import numpy as np
+from iso639 import languages
+from asyncio_pool import AioPool
+import asyncio as aio
+
+API_URL = "http://localhost:8000"
+
+
+def test_get_supported_languages():
+    response = requests.get(API_URL + "/api/language/predict")
+    assert response.status_code == 200
+    resp_body = response.json()
+    assert isinstance(resp_body["supported_languages"], list)
+    assert isinstance(resp_body["supported_languages"][0], str)
+
+
+def test_post_language_detection():
+    response = requests.post(
+        API_URL + "/api/language/predict", json={"text": "Testing this application"}
+    )
+    assert response.status_code == 200
+    resp_body = response.json()
+    assert resp_body["lang"] == "English"
+
+
+def getSupportedLanguages():
+    allLangs = requests.get(API_URL + "/api/language/predict").json()
+    allLangs = allLangs["supported_languages"]
+    return allLangs
+
+
+def getTestData():
+    cacheFile = "./test.csv"
+    testDataUrl = "https://huggingface.co/datasets/papluca/language-identification/raw/main/test.csv"
+
+    supportedLangs = getSupportedLanguages()
+    if os.path.exists(cacheFile):
+        data = pd.read_csv(cacheFile)
+    else:
+        data = pd.read_csv(testDataUrl)
+        data.to_csv(cacheFile)
+    # Map ISO 639-1 codes to language names and keep only supported languages.
+    data.rename(columns={"labels": "language", "text": "Text"}, inplace=True)
+    data["language"] = data["language"].apply(lambda code: languages.part1[code].name)
+    data = data[data["language"].isin(supportedLangs)]
+    return data
+
+
+async def task(row):
+    row = row[1]
+    response = requests.post(
+        API_URL + "/api/language/predict", json={"text": row["Text"]}
+    )
+    resp_body = response.json()
+    return [resp_body, row]
+
+
+async def evaluateAccuracy():
+    data = getTestData()
+    stats = {}
+    failed = []
+    pool = AioPool(10)
+
+    results = await pool.map(task, data.iterrows())
+    print("completed %d requests" % len(data))
+    for [resp_body, row] in results:
+        lang = row["language"]
+        if lang not in stats:
+            stats[lang] = {"total": 0, "failed": 0}
+        if not resp_body["lang"] == lang:
+            stats[lang]["failed"] += 1
+            failed.append(row)
+
+        stats[lang]["total"] += 1
+
+    stats = pd.DataFrame(stats).T
+    stats["success_percent"] = (1 - (stats["failed"] / stats["total"])) * 100.0
+    stats.sort_values("success_percent", ascending=False, inplace=True)
+    print(stats)
+    return stats
+
+
+if __name__ == "__main__":
+    loop = aio.new_event_loop()
+    loop.run_until_complete(evaluateAccuracy())
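For completeness, the new GET route added to server.py can be exercised from Python in the same way the curl example in Readme.md exercises the POST route. A minimal client snippet (illustrative; it assumes the server is running locally on port 8000):

```python
# Illustrative only: list supported languages via the new GET route, then detect
# the language of a sample sentence via the existing POST route.
import requests

BASE = "http://localhost:8000/api/language/predict"

langs = requests.get(BASE).json()["supported_languages"]
print(langs)  # e.g. ["Arabic", "Chinese", ..., "English", ...]

pred = requests.post(BASE, json={"text": "Hello how old are you"}).json()
print(pred["lang"], pred["score"])  # e.g. English 0.9998260997236165
```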