// Licensed under the MIT License
// https://github.com/craigahobbs/bare-script/blob/main/LICENSE
/** @module lib/data */
import {valueBoolean, valueCompare, valueJSON, valueParseDatetime, valueParseNumber} from './value.js';
import {evaluateExpression} from './runtime.js';
import {parseExpression} from './parser.js';
import {parseSchemaMarkdown} from 'schema-markdown/lib/parser.js';
import {validateType} from 'schema-markdown/lib/schema.js';
/**
* Parse and validate CSV text to a data array
*
* @param {string} text - The CSV text
* @returns {Object[]} The data array
*/
export function parseCSV(text) {
// Line-split the text
const lines = [];
if (typeof text === 'string') {
lines.push(...text.split(rCSVLineSplit));
} else {
for (const textPart of text) {
lines.push(...textPart.split(rCSVLineSplit));
}
}
// Split lines into rows
const rows = lines.filter((line) => !line.match(rCSVBlankLine)).map((line) => {
const row = [];
let linePart = line;
while (linePart !== null) {
// Quoted field?
const mQuoted = linePart.match(rCSVQuotedField) ?? linePart.match(rCSVQuotedFieldEnd);
if (mQuoted !== null) {
row.push(mQuoted[1].replaceAll(rCSVQuoteEscape, '"'));
linePart = linePart.slice(mQuoted[0].length);
continue;
}
// Non-quoted field
const ixComma = linePart.indexOf(',');
row.push(ixComma !== -1 ? linePart.slice(0, ixComma) : linePart);
linePart = (ixComma !== -1 ? linePart.slice(ixComma + 1) : null);
}
return row;
});
// Assemble the data rows
const result = [];
if (rows.length >= 2) {
const fields = rows[0].map((field) => field.trim());
for (let ixLine = 1; ixLine < rows.length; ixLine += 1) {
const row = rows[ixLine];
result.push(Object.fromEntries(fields.map(
(field, ixField) => [field, ixField < row.length ? row[ixField] : null]
)));
}
}
return result;
}
const rCSVLineSplit = /\r?\n/;
const rCSVBlankLine = /^\s*$/;
const rCSVQuotedField = /^"((?:""|[^"])*)",/;
const rCSVQuotedFieldEnd = /^"((?:""|[^"])*)"\s*$/;
const rCSVQuoteEscape = /""/g;
/**
* Determine data field types and parse/validate field values
*
* @param {Object[]} data - The data array. Row objects are updated with parsed/validated values.
* @param {boolean} [csv=false] - If true, parse value strings
* @returns {Object} The map of field name to field type ("boolean", "datetime", "number", "string")
* @throws Throws an error if data is invalid
*/
export function validateData(data, csv = false) {
// Determine field types
const types = {};
for (const row of data) {
for (const [field, value] of Object.entries(row)) {
if ((types[field] ?? null) === null) {
if (typeof value === 'boolean') {
types[field] = 'boolean';
} if (typeof value === 'number') {
types[field] = 'number';
} else if (value instanceof Date) {
types[field] = 'datetime';
} else if (typeof value === 'string') {
// If we aren't parsing CSV strings, its just a string
if (!csv) {
types[field] = 'string';
// If its the null string we can't determine the type yet
} else if (value === '' || value === 'null') {
types[field] = null;
// Can the string be parsed into another type?
} else if (valueParseDatetime(value) !== null) {
types[field] = 'datetime';
} else if (value === 'true' || value === 'false') {
types[field] = 'boolean';
} else if (valueParseNumber(value) !== null) {
types[field] = 'number';
} else {
types[field] = 'string';
}
}
}
}
}
// Set the type for fields with undetermined type
for (const [field, fieldType] of Object.entries(types)) {
if (fieldType === null) {
types[field] = 'string';
}
}
// Helper to format and raise validation errors
const throwFieldError = (field, fieldType, fieldValue) => {
throw new Error(`Invalid "${field}" field value ${valueJSON(fieldValue)}, expected type ${fieldType}`);
};
// Validate field values
for (const row of data) {
for (const [field, value] of Object.entries(row)) {
const fieldType = types[field] ?? null;
if (fieldType === null) {
continue;
}
// Null string?
if (csv && value === 'null') {
row[field] = null;
// Number field
} else if (fieldType === 'number') {
if (csv && typeof value === 'string') {
let numberValue;
if (value === '') {
numberValue = null;
} else {
numberValue = valueParseNumber(value);
if (numberValue === null) {
throwFieldError(field, fieldType, value);
}
}
row[field] = numberValue;
} else if (value !== null && typeof value !== 'number') {
throwFieldError(field, fieldType, value);
}
// Datetime field
} else if (fieldType === 'datetime') {
if (csv && typeof value === 'string') {
let datetimeValue;
if (value === '') {
datetimeValue = null;
} else {
datetimeValue = valueParseDatetime(value);
if (datetimeValue === null) {
throwFieldError(field, fieldType, value);
}
}
row[field] = datetimeValue;
} else if (value !== null && !(value instanceof Date)) {
throwFieldError(field, fieldType, value);
}
// Boolean field
} else if (fieldType === 'boolean') {
if (csv && typeof value === 'string') {
let booleanValue;
if (value === '') {
booleanValue = null;
} else {
booleanValue = (value === 'true' ? true : (value === 'false' ? false : null));
if (booleanValue === null) {
throwFieldError(field, fieldType, value);
}
}
row[field] = booleanValue;
} else if (value !== null && typeof value !== 'boolean') {
throwFieldError(field, fieldType, value);
}
// String field
} else {
if (value !== null && typeof value !== 'string') {
throwFieldError(field, fieldType, value);
}
}
}
}
return types;
}
/**
* Join two data arrays
*
* @param {Object} leftData - The left data array
* @param {Object} rightData - The left data array
* @param {string} joinExpr - The join [expression](./language/#expressions)
* @param {?string} [rightExpr = null] - The right join [expression](./language/#expressions)
* @param {boolean} [isLeftJoin = false] - If true, perform a left join (always include left row)
* @param {?Object} [variables = null] - Additional variables for expression evaluation
* @param {?Object} [options = null] - The [script execution options]{@link module:lib/options~ExecuteScriptOptions}
* @returns {Object[]} The joined data array
*/
export function joinData(leftData, rightData, joinExpr, rightExpr = null, isLeftJoin = false, variables = null, options = null) {
// Compute the map of row field name to joined row field name
const leftNames = {};
const rightNamesRaw = {};
const rightNames = {};
for (const row of leftData) {
for (const fieldName of Object.keys(row)) {
if (!(fieldName in leftNames)) {
leftNames[fieldName] = fieldName;
}
}
}
for (const row of rightData) {
for (const fieldName of Object.keys(row)) {
if (!(fieldName in rightNames)) {
rightNamesRaw[fieldName] = fieldName;
}
}
}
for (const fieldName of Object.keys(rightNamesRaw)) {
if (!(fieldName in leftNames)) {
rightNames[fieldName] = fieldName;
} else {
let ixUnique = 2;
let uniqueName = `${fieldName}${ixUnique}`;
while (uniqueName in leftNames || uniqueName in rightNames || uniqueName in rightNamesRaw) {
ixUnique += 1;
uniqueName = `${fieldName}${ixUnique}`;
}
rightNames[fieldName] = uniqueName;
}
}
// Create the evaluation options object
let evalOptions = options;
if (variables !== null) {
evalOptions = (options !== null ? {...options} : {});
if ('globals' in evalOptions) {
evalOptions.globals = {...evalOptions.globals, ...variables};
} else {
evalOptions.globals = variables;
}
}
// Parse the left and right expressions
const leftExpression = parseExpression(joinExpr);
const rightExpression = (rightExpr !== null ? parseExpression(rightExpr) : leftExpression);
// Bucket the right rows by the right expression value
const rightCategoryRows = {};
for (const rightRow of rightData) {
const categoryKey = valueJSON(evaluateExpression(rightExpression, evalOptions, rightRow));
if (!(categoryKey in rightCategoryRows)) {
rightCategoryRows[categoryKey] = [];
}
rightCategoryRows[categoryKey].push(rightRow);
}
// Join the left with the right
const data = [];
for (const leftRow of leftData) {
const categoryKey = valueJSON(evaluateExpression(leftExpression, evalOptions, leftRow));
if (categoryKey in rightCategoryRows) {
for (const rightRow of rightCategoryRows[categoryKey]) {
const joinRow = {...leftRow};
for (const [rightName, rightValue] of Object.entries(rightRow)) {
joinRow[rightNames[rightName]] = rightValue;
}
data.push(joinRow);
}
} else if (!isLeftJoin) {
data.push({...leftRow});
}
}
return data;
}
/**
* Add a calculated field to each row of a data array
*
* @param {Object[]} data - The data array. Row objects are updated with the calculated field values.
* @param {string} fieldName - The calculated field name
* @param {string} expr - The calculated field expression
* @param {?Object} [variables = null] - Additional variables for expression evaluation
* @param {?Object} [options = null] - The [script execution options]{@link module:lib/options~ExecuteScriptOptions}
* @returns {Object[]} The updated data array
*/
export function addCalculatedField(data, fieldName, expr, variables = null, options = null) {
// Parse the calculation expression
const calcExpr = parseExpression(expr);
// Create the evaluation options object
let evalOptions = options;
if (variables !== null) {
evalOptions = (options !== null ? {...options} : {});
if ('globals' in evalOptions) {
evalOptions.globals = {...evalOptions.globals, ...variables};
} else {
evalOptions.globals = variables;
}
}
// Compute the calculated field for each row
for (const row of data) {
row[fieldName] = evaluateExpression(calcExpr, evalOptions, row);
}
return data;
}
/**
* Filter data rows
*
* @param {Object[]} data - The data array
* @param {string} expr - The boolean filter [expression](./language/#expressions)
* @param {?Object} [variables = null] - Additional variables for expression evaluation
* @param {?Object} [options = null] - The [script execution options]{@link module:lib/options~ExecuteScriptOptions}
* @returns {Object[]} The filtered data array
*/
export function filterData(data, expr, variables = null, options = null) {
const result = [];
// Parse the filter expression
const filterExpr = parseExpression(expr);
// Create the evaluation options object
let evalOptions = options;
if (variables !== null) {
evalOptions = (options !== null ? {...options} : {});
if ('globals' in evalOptions) {
evalOptions.globals = {...evalOptions.globals, ...variables};
} else {
evalOptions.globals = variables;
}
}
// Filter the data
for (const row of data) {
if (valueBoolean(evaluateExpression(filterExpr, evalOptions, row))) {
result.push(row);
}
}
return result;
}
/**
* Aggregate data rows
*
* @param {Object[]} data - The data array
* @param {Object} aggregation - The [aggregation model](./library/model.html#var.vName='Aggregation')
* @returns {Object[]} The aggregated data array
*/
export function aggregateData(data, aggregation) {
// Validate the aggregation model
validateType(aggregationTypes, 'Aggregation', aggregation);
const categories = aggregation.categories ?? null;
// Create the aggregate rows
const categoryRows = {};
for (const row of data) {
// Compute the category values
const categoryValues = (categories !== null ? categories.map((categoryField) => row[categoryField]) : null);
// Get or create the aggregate row
let aggregateRow;
const rowKey = (categoryValues !== null ? valueJSON(categoryValues) : '');
if (rowKey in categoryRows) {
aggregateRow = categoryRows[rowKey];
} else {
aggregateRow = {};
categoryRows[rowKey] = aggregateRow;
if (categories !== null) {
for (let ixCategoryField = 0; ixCategoryField < categories.length; ixCategoryField++) {
aggregateRow[categories[ixCategoryField]] = categoryValues[ixCategoryField];
}
}
}
// Add to the aggregate measure values
for (const measure of aggregation.measures) {
const field = measure.name ?? measure.field;
const value = row[measure.field] ?? null;
if (!(field in aggregateRow)) {
aggregateRow[field] = [];
}
if (value !== null) {
aggregateRow[field].push(value);
}
}
}
// Compute the measure values aggregate function value
const aggregateRows = Object.values(categoryRows);
for (const aggregateRow of aggregateRows) {
for (const measure of aggregation.measures) {
const field = measure.name ?? measure.field;
const func = measure.function;
const measureValues = aggregateRow[field];
if (!measureValues.length) {
aggregateRow[field] = null;
} else if (func === 'count') {
aggregateRow[field] = measureValues.length;
} else if (func === 'max') {
aggregateRow[field] = measureValues.reduce((max, val) => (val > max ? val : max));
} else if (func === 'min') {
aggregateRow[field] = measureValues.reduce((min, val) => (val < min ? val : min));
} else if (func === 'sum') {
aggregateRow[field] = measureValues.reduce((sum, val) => sum + val, 0);
} else if (func === 'stddev') {
const average = measureValues.reduce((sum, val) => sum + val, 0) / measureValues.length;
aggregateRow[field] = Math.sqrt(measureValues.reduce((sum, val) => sum + (val - average) ** 2, 0) / measureValues.length);
} else {
// func === 'average'
aggregateRow[field] = measureValues.reduce((sum, val) => sum + val, 0) / measureValues.length;
}
}
}
return aggregateRows;
}
// The aggregation model
export const aggregationTypes = parseSchemaMarkdown(`\
group "Aggregation"
# A data aggregation specification
struct Aggregation
# The aggregation category fields
optional string[len > 0] categories
# The aggregation measures
AggregationMeasure[len > 0] measures
# An aggregation measure specification
struct AggregationMeasure
# The aggregation measure field
string field
# The aggregation function
AggregationFunction function
# The aggregated-measure field name
optional string name
# An aggregation function
enum AggregationFunction
# The average of the measure's values
average
# The count of the measure's values
count
# The greatest of the measure's values
max
# The least of the measure's values
min
# The standard deviation of the measure's values
stddev
# The sum of the measure's values
sum
`);
/**
* Sort data rows
*
* @param {Object[]} data - The data array
* @param {Object[]} sorts - The sort field-name/descending-sort tuples
* @returns {Object[]} The sorted data array
*/
export function sortData(data, sorts) {
return data.sort((row1, row2) => sorts.reduce((result, sort) => {
if (result !== 0) {
return result;
}
const [field, desc = false] = sort;
const value1 = row1[field] ?? null;
const value2 = row2[field] ?? null;
const compare = valueCompare(value1, value2);
return desc ? -compare : compare;
}, 0));
}
/**
* Top data rows
*
* @param {Object[]} data - The data array
* @param {number} count - The number of rows to keep
* @param {?string[]} [categoryFields = null] - The category fields
* @returns {Object[]} The top data array
*/
export function topData(data, count, categoryFields = null) {
// Bucket rows by category
const categoryRows = {};
const categoryOrder = [];
for (const row of data) {
const categoryKey = categoryFields === null ? ''
: valueJSON(categoryFields.map((field) => (field in row ? row[field] : null)));
if (!(categoryKey in categoryRows)) {
categoryRows[categoryKey] = [];
categoryOrder.push(categoryKey);
}
categoryRows[categoryKey].push(row);
}
// Take only the top rows
const dataTop = [];
const topCount = count;
for (const categoryKey of categoryOrder) {
const categoryKeyRows = categoryRows[categoryKey];
const categoryKeyLength = categoryKeyRows.length;
for (let ixRow = 0; ixRow < topCount && ixRow < categoryKeyLength; ixRow++) {
dataTop.push(categoryKeyRows[ixRow]);
}
}
return dataTop;
}