Module elastiknn.utils

Expand source code
from random import Random
from typing import List, Iterator, Union

import numpy as np
from scipy.sparse import csr_matrix

from elastiknn.api import Vec

# Module-level pseudo-random generator created with a fixed seed of 0.
# NOTE(review): no function visible in this module references it — confirm it is still needed.
_rng = Random(0)

# Supported combinations, one tuple per entry. The first element is one of
# 'exact', 'lsh' or 'permutation_lsh'; the second is a metric name written in
# the canonical lowercase form that dealias_metric returns ('l1', 'l2', ...).
# NOTE(review): presumably (algorithm, metric) pairs, i.e. the reverse of the
# order suggested by the variable name — confirm against callers.
valid_metrics_algos = [
    ('exact', 'l1'),
    ('exact', 'l2'),
    ('exact', 'cosine'),
    ('exact', 'hamming'),
    ('exact', 'jaccard'),
    ('lsh', 'l2'),
    ('lsh', 'cosine'),
    ('lsh', 'jaccard'),
    ('lsh', 'hamming'),
    ('permutation_lsh', 'cosine'),
    ('permutation_lsh', 'l2')
]

def dealias_metric(metric: str) -> str:
    """Map a metric name or alias to its canonical lowercase name.

    The lookup is case-insensitive: ``'euclidean'`` becomes ``'l2'`` and
    ``'angular'`` becomes ``'cosine'``. Any other name is returned lowercased
    and otherwise unchanged.

    Args:
        metric: Metric name or alias, in any letter case.

    Returns:
        The canonical lowercase metric name.
    """
    canonical = metric.lower()
    aliases = {'euclidean': 'l2', 'angular': 'cosine'}
    # Names without an alias entry fall through as their lowercased selves.
    return aliases.get(canonical, canonical)



def sparse_bool_vectors_to_csr(sbvs: List[Vec.SparseBool]) -> csr_matrix:
    """Pack sparse boolean vectors into a boolean CSR matrix, one row per vector.

    Args:
        sbvs: Vectors to convert. Row ``i`` of the result is True at every
            column listed in ``sbvs[i].true_indices``. The column count is
            read from the first vector's ``total_indices``; the remaining
            vectors are assumed to share it.

    Returns:
        A ``csr_matrix`` of dtype bool with shape
        ``(len(sbvs), sbvs[0].total_indices)``, or an empty ``(0, 0)`` matrix
        when ``sbvs`` is empty.
    """
    if len(sbvs) == 0:
        # There is no first vector to read the width from; indexing sbvs[0]
        # here used to raise IndexError.
        return csr_matrix((0, 0), dtype=bool)
    rows, cols = [], []
    for row, sbv in enumerate(sbvs):
        for col in sbv.true_indices:
            rows.append(row)
            cols.append(col)
    # Every stored entry is True, so one value per collected coordinate.
    data = [True] * len(cols)
    return csr_matrix((data, (rows, cols)), shape=(len(sbvs), sbvs[0].total_indices), dtype=bool)


def csr_to_sparse_bool_vectors(csr: csr_matrix) -> Iterator[Vec.SparseBool]:
    """Lazily convert each row of a CSR matrix into a ``Vec.SparseBool``.

    Args:
        csr: Sparse matrix; it is iterated row by row.

    Returns:
        An iterator yielding one ``Vec.SparseBool`` per row. ``true_indices``
        holds the row's stored column indices as built-in ``int`` values and
        ``total_indices`` is the row's column count.
    """
    def _row_to_vec(row):
        # tolist() produces built-in ints rather than numpy integer scalars,
        # matching what ndarray_to_sparse_bool_vectors emits.
        return Vec.SparseBool(true_indices=row.indices.tolist(), total_indices=row.shape[-1])

    return map(_row_to_vec, csr)


def float_vectors_to_ndarray(fvs: List[Vec.DenseFloat]) -> np.ndarray:
    """Stack dense float vectors into a 2-D array, one row per vector.

    Args:
        fvs: Vectors to convert. The column count is read from the length of
            the first vector's ``values``; the remaining vectors are assumed
            to have the same length.

    Returns:
        An array of shape ``(len(fvs), len(fvs[0].values))`` in NumPy's
        default float dtype, or an empty ``(0, 0)`` array when ``fvs`` is
        empty.
    """
    if len(fvs) == 0:
        # There is no first vector to read the width from; indexing fvs[0]
        # here used to raise IndexError.
        return np.zeros(shape=(0, 0))
    arr = np.zeros(shape=(len(fvs), len(fvs[0].values)))
    for i, fv in enumerate(fvs):
        arr[i] = list(fv.values)
    return arr


def ndarray_to_dense_float_vectors(arr: np.ndarray) -> Iterator[Vec.DenseFloat]:
    """Lazily convert each row of an array into a ``Vec.DenseFloat``.

    Args:
        arr: Array iterated along its first axis; each item is one vector.

    Returns:
        An iterator yielding one ``Vec.DenseFloat`` per row, with every
        element of the row passed through ``float()``.
    """
    def _to_dense(row):
        return Vec.DenseFloat(values=[float(component) for component in row])

    return map(_to_dense, arr)


def ndarray_to_sparse_bool_vectors(arr: np.ndarray) -> Iterator[Vec.SparseBool]:
    """Lazily convert each row of an array into a ``Vec.SparseBool``.

    Args:
        arr: Array iterated along its first axis; each item is one vector.

    Returns:
        An iterator yielding one ``Vec.SparseBool`` per row. ``true_indices``
        lists, as built-in ``int`` values, the positions that ``np.where``
        reports for the row, and ``total_indices`` is ``len(row)``.
    """
    def _to_sparse(row):
        set_positions = np.where(row)[0]
        return Vec.SparseBool(
            true_indices=[int(position) for position in set_positions],
            total_indices=len(row),
        )

    return map(_to_sparse, arr)


def canonical_vectors_to_elastiknn(canonical: Union[np.ndarray, csr_matrix]) -> Iterator[Union[Vec.SparseBool, Vec.DenseFloat]]:
    """Convert a supported vector container into elastiknn ``Vec`` objects.

    Dispatch rules, checked in this order:
      * ``np.ndarray`` with dtype bool: ``ndarray_to_sparse_bool_vectors``.
      * any other ``np.ndarray``: ``ndarray_to_dense_float_vectors``.
      * ``csr_matrix``: ``csr_to_sparse_bool_vectors``.
      * ``list`` that is empty or whose first element is a ``Vec.Base``:
        returned unchanged (only the first element is inspected).

    Args:
        canonical: The vectors to convert.

    Returns:
        An iterator of vectors for array and sparse-matrix input, or the
        input list itself for list input.

    Raises:
        TypeError: If ``canonical`` matches none of the cases above.
    """
    if isinstance(canonical, np.ndarray):
        if canonical.dtype == bool:
            return ndarray_to_sparse_bool_vectors(canonical)
        return ndarray_to_dense_float_vectors(canonical)
    if isinstance(canonical, csr_matrix):
        return csr_to_sparse_bool_vectors(canonical)
    # An empty list has no first element to inspect; pass it through instead
    # of failing with IndexError on canonical[0].
    if isinstance(canonical, list) and (not canonical or isinstance(canonical[0], Vec.Base)):
        return canonical
    raise TypeError(f"Expected a numpy array or a csr matrix but got {type(canonical)}")

Functions

def canonical_vectors_to_elastiknn(canonical: Union[numpy.ndarray, scipy.sparse._csr.csr_matrix]) -> Iterator[Union[Vec.SparseBool, Vec.DenseFloat]]
Expand source code
def canonical_vectors_to_elastiknn(canonical: Union[np.ndarray, csr_matrix]) -> Iterator[Union[Vec.SparseBool, Vec.DenseFloat]]:
    """Convert a supported vector container into elastiknn ``Vec`` objects.

    Dispatch rules, checked in this order:
      * ``np.ndarray`` with dtype bool: ``ndarray_to_sparse_bool_vectors``.
      * any other ``np.ndarray``: ``ndarray_to_dense_float_vectors``.
      * ``csr_matrix``: ``csr_to_sparse_bool_vectors``.
      * ``list`` that is empty or whose first element is a ``Vec.Base``:
        returned unchanged (only the first element is inspected).

    Args:
        canonical: The vectors to convert.

    Returns:
        An iterator of vectors for array and sparse-matrix input, or the
        input list itself for list input.

    Raises:
        TypeError: If ``canonical`` matches none of the cases above.
    """
    if isinstance(canonical, np.ndarray):
        if canonical.dtype == bool:
            return ndarray_to_sparse_bool_vectors(canonical)
        return ndarray_to_dense_float_vectors(canonical)
    if isinstance(canonical, csr_matrix):
        return csr_to_sparse_bool_vectors(canonical)
    # An empty list has no first element to inspect; pass it through instead
    # of failing with IndexError on canonical[0].
    if isinstance(canonical, list) and (not canonical or isinstance(canonical[0], Vec.Base)):
        return canonical
    raise TypeError(f"Expected a numpy array or a csr matrix but got {type(canonical)}")
def csr_to_sparse_bool_vectors(csr: scipy.sparse._csr.csr_matrix) -> Iterator[Vec.SparseBool]
Expand source code
def csr_to_sparse_bool_vectors(csr: csr_matrix) -> Iterator[Vec.SparseBool]:
    """Lazily convert each row of a CSR matrix into a ``Vec.SparseBool``.

    Args:
        csr: Sparse matrix; it is iterated row by row.

    Returns:
        An iterator yielding one ``Vec.SparseBool`` per row. ``true_indices``
        holds the row's stored column indices as built-in ``int`` values and
        ``total_indices`` is the row's column count.
    """
    def _row_to_vec(row):
        # tolist() produces built-in ints rather than numpy integer scalars,
        # matching what ndarray_to_sparse_bool_vectors emits.
        return Vec.SparseBool(true_indices=row.indices.tolist(), total_indices=row.shape[-1])

    return map(_row_to_vec, csr)
def dealias_metric(metric: str) -> str
Expand source code
def dealias_metric(metric: str) -> str:
    """Map a metric name or alias to its canonical lowercase name.

    The lookup is case-insensitive: ``'euclidean'`` becomes ``'l2'`` and
    ``'angular'`` becomes ``'cosine'``. Any other name is returned lowercased
    and otherwise unchanged.

    Args:
        metric: Metric name or alias, in any letter case.

    Returns:
        The canonical lowercase metric name.
    """
    canonical = metric.lower()
    aliases = {'euclidean': 'l2', 'angular': 'cosine'}
    # Names without an alias entry fall through as their lowercased selves.
    return aliases.get(canonical, canonical)
def float_vectors_to_ndarray(fvs: List[Vec.DenseFloat]) -> numpy.ndarray
Expand source code
def float_vectors_to_ndarray(fvs: List[Vec.DenseFloat]) -> np.ndarray:
    """Stack dense float vectors into a 2-D array, one row per vector.

    Args:
        fvs: Vectors to convert. The column count is read from the length of
            the first vector's ``values``; the remaining vectors are assumed
            to have the same length.

    Returns:
        An array of shape ``(len(fvs), len(fvs[0].values))`` in NumPy's
        default float dtype, or an empty ``(0, 0)`` array when ``fvs`` is
        empty.
    """
    if len(fvs) == 0:
        # There is no first vector to read the width from; indexing fvs[0]
        # here used to raise IndexError.
        return np.zeros(shape=(0, 0))
    arr = np.zeros(shape=(len(fvs), len(fvs[0].values)))
    for i, fv in enumerate(fvs):
        arr[i] = list(fv.values)
    return arr
def ndarray_to_dense_float_vectors(arr: numpy.ndarray) -> Iterator[Vec.DenseFloat]
Expand source code
def ndarray_to_dense_float_vectors(arr: np.ndarray) -> Iterator[Vec.DenseFloat]:
    """Lazily convert each row of an array into a ``Vec.DenseFloat``.

    Args:
        arr: Array iterated along its first axis; each item is one vector.

    Returns:
        An iterator yielding one ``Vec.DenseFloat`` per row, with every
        element of the row passed through ``float()``.
    """
    def _to_dense(row):
        return Vec.DenseFloat(values=[float(component) for component in row])

    return map(_to_dense, arr)
def ndarray_to_sparse_bool_vectors(arr: numpy.ndarray) -> Iterator[Vec.SparseBool]
Expand source code
def ndarray_to_sparse_bool_vectors(arr: np.ndarray) -> Iterator[Vec.SparseBool]:
    """Lazily convert each row of an array into a ``Vec.SparseBool``.

    Args:
        arr: Array iterated along its first axis; each item is one vector.

    Returns:
        An iterator yielding one ``Vec.SparseBool`` per row. ``true_indices``
        lists, as built-in ``int`` values, the positions that ``np.where``
        reports for the row, and ``total_indices`` is ``len(row)``.
    """
    def _to_sparse(row):
        set_positions = np.where(row)[0]
        return Vec.SparseBool(
            true_indices=[int(position) for position in set_positions],
            total_indices=len(row),
        )

    return map(_to_sparse, arr)
def sparse_bool_vectors_to_csr(sbvs: List[Vec.SparseBool]) -> scipy.sparse._csr.csr_matrix
Expand source code
def sparse_bool_vectors_to_csr(sbvs: List[Vec.SparseBool]) -> csr_matrix:
    """Pack sparse boolean vectors into a boolean CSR matrix, one row per vector.

    Args:
        sbvs: Vectors to convert. Row ``i`` of the result is True at every
            column listed in ``sbvs[i].true_indices``. The column count is
            read from the first vector's ``total_indices``; the remaining
            vectors are assumed to share it.

    Returns:
        A ``csr_matrix`` of dtype bool with shape
        ``(len(sbvs), sbvs[0].total_indices)``, or an empty ``(0, 0)`` matrix
        when ``sbvs`` is empty.
    """
    if len(sbvs) == 0:
        # There is no first vector to read the width from; indexing sbvs[0]
        # here used to raise IndexError.
        return csr_matrix((0, 0), dtype=bool)
    rows, cols = [], []
    for row, sbv in enumerate(sbvs):
        for col in sbv.true_indices:
            rows.append(row)
            cols.append(col)
    # Every stored entry is True, so one value per collected coordinate.
    data = [True] * len(cols)
    return csr_matrix((data, (rows, cols)), shape=(len(sbvs), sbvs[0].total_indices), dtype=bool)