Files
dreport/layout-engine/src/font_meta.rs

331 lines
9.9 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
use serde::{Deserialize, Serialize};
/// Parsed metadata from a single font file.
///
/// Built by `parse_font_meta` from the font's `OS/2`, `head` and `name`
/// tables. Numeric fields hold the raw values read from the file; the parser
/// does not validate or clamp them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FontMeta {
/// Font family name from name table (nameID 16 preferred, fallback nameID 1)
pub family: String,
/// usWeightClass from OS/2 table (typically 100-900; stored as read, not range-checked)
pub weight: u16,
/// fsSelection bit 0 from OS/2 table
pub italic: bool,
/// unitsPerEm from head table
pub units_per_em: u16,
/// sTypoAscender from OS/2 table
pub ascender: i16,
/// sTypoDescender from OS/2 table
pub descender: i16,
}
/// Variant key for looking up a specific font within a family.
///
/// Derives `Hash` and `Eq`, so it is usable as a map or set key.
/// `FontMeta::variant_key` builds one from parsed metadata.
#[derive(Debug, Clone, Hash, Eq, PartialEq, Serialize, Deserialize)]
pub struct FontVariantKey {
/// Weight of the variant; `FontMeta::variant_key` copies `FontMeta::weight` here
pub weight: u16,
/// Italic flag of the variant; `FontMeta::variant_key` copies `FontMeta::italic` here
pub italic: bool,
}
/// Summary of a font family with all its available variants.
///
/// NOTE(review): nothing in this file constructs this type; the field notes
/// below describe the intent suggested by the names and should be confirmed
/// against the code that builds it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FontFamilyInfo {
/// Family name (presumably the same string as `FontMeta::family` - confirm)
pub family: String,
/// Weight/italic combinations available for this family
pub variants: Vec<FontVariantKey>,
}
impl FontMeta {
pub fn variant_key(&self) -> FontVariantKey {
FontVariantKey {
weight: self.weight,
italic: self.italic,
}
}
pub fn is_bold(&self) -> bool {
self.weight >= 700
}
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/// Read a big-endian `u16` from `data` at byte position `offset`.
///
/// Returns `None` when the two bytes `offset..offset + 2` do not lie entirely
/// inside `data`. The end position is computed with `checked_add`, so an
/// offset close to `usize::MAX` also yields `None` instead of overflowing
/// (which would panic in debug builds and index out of bounds in release).
fn read_u16(data: &[u8], offset: usize) -> Option<u16> {
    let end = offset.checked_add(2)?;
    // `get` performs the bounds check once; the slice is exactly 2 bytes long.
    let bytes = data.get(offset..end)?;
    Some(u16::from_be_bytes([bytes[0], bytes[1]]))
}
/// Read a big-endian `i16` from `data` at byte position `offset`.
///
/// Returns `None` when the two bytes `offset..offset + 2` do not lie entirely
/// inside `data`. The end position is computed with `checked_add`, so an
/// offset close to `usize::MAX` also yields `None` instead of overflowing
/// (which would panic in debug builds and index out of bounds in release).
fn read_i16(data: &[u8], offset: usize) -> Option<i16> {
    let end = offset.checked_add(2)?;
    // `get` performs the bounds check once; the slice is exactly 2 bytes long.
    let bytes = data.get(offset..end)?;
    Some(i16::from_be_bytes([bytes[0], bytes[1]]))
}
/// Read a big-endian `u32` from `data` at byte position `offset`.
///
/// Returns `None` when the four bytes `offset..offset + 4` do not lie
/// entirely inside `data`. The end position is computed with `checked_add`,
/// so an offset close to `usize::MAX` also yields `None` instead of
/// overflowing (which would panic in debug builds and index out of bounds in
/// release).
fn read_u32(data: &[u8], offset: usize) -> Option<u32> {
    let end = offset.checked_add(4)?;
    // `get` performs the bounds check once; the slice is exactly 4 bytes long.
    let bytes = data.get(offset..end)?;
    Some(u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]))
}
/// Look up a table in the font's table directory by its 4-byte ASCII tag.
///
/// Layout assumed by this function:
///
/// * Offset table (first 12 bytes): `sfVersion` (u32), `numTables` (u16 at
///   byte 4), then `searchRange`, `entrySelector`, `rangeShift` (u16 each).
/// * Table directory, starting at byte 12: `numTables` entries of 16 bytes,
///   each `tag` (4 bytes), `checksum` (u32), `offset` (u32), `length` (u32).
///
/// Returns `Some((offset, length))` for the first entry whose tag equals
/// `tag`; `offset` is relative to the start of `data`. Returns `None` when
/// the header is too short, the tag is absent (or the directory is truncated
/// before it is reached), or the matching entry's byte range runs past the end
/// of `data`.
fn find_table(data: &[u8], tag: &[u8; 4]) -> Option<(usize, usize)> {
    const HEADER_LEN: usize = 12;
    const ENTRY_LEN: usize = 16;

    let num_tables = usize::from(read_u16(data, 4)?);

    // Only complete 16-byte entries are considered, and never more than the
    // header announces.
    let entry = data
        .get(HEADER_LEN..)?
        .chunks_exact(ENTRY_LEN)
        .take(num_tables)
        .find(|entry| entry[..4] == tag[..])?;

    let table_offset = read_u32(entry, 8)? as usize;
    let table_length = read_u32(entry, 12)? as usize;

    // Basic sanity check: the table must fit inside the buffer.
    table_offset
        .checked_add(table_length)
        .filter(|&table_end| table_end <= data.len())
        .map(|_| (table_offset, table_length))
}
/// Decode a UTF-16BE byte slice into a `String`.
///
/// Returns `None` when `raw` has an odd number of bytes or contains an
/// unpaired surrogate. An empty slice decodes to an empty string.
fn decode_utf16be(raw: &[u8]) -> Option<String> {
    let pairs = raw.chunks_exact(2);
    // A leftover byte means the input is not a whole number of code units.
    if !pairs.remainder().is_empty() {
        return None;
    }
    let code_units = pairs.map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
    char::decode_utf16(code_units)
        .collect::<Result<String, _>>()
        .ok()
}
/// Decode a MacRoman (platform 1, encoding 0) byte slice into a `String`.
///
/// MacRoman agrees with ASCII for bytes 0x00-0x7F, so those are passed
/// through unchanged. Every byte from 0x80 upwards is replaced with U+FFFD
/// (the Unicode replacement character) rather than being looked up in the
/// full MacRoman table; this is a deliberate simplification, since font
/// family names are almost always pure ASCII.
fn decode_mac_roman(raw: &[u8]) -> String {
    let mut text = String::with_capacity(raw.len());
    for &byte in raw {
        let ch = match byte {
            0x00..=0x7F => char::from(byte),
            // Simplified: non-ASCII MacRoman bytes are not translated.
            _ => '\u{FFFD}',
        };
        text.push(ch);
    }
    text
}
/// Extract the font family name from the `name` table located at
/// `table_offset` (length `table_length`) inside `data`.
///
/// Table layout used here:
///
/// * Header: `format` (u16), `count` (u16 at +2), `stringOffset` (u16 at +4,
///   measured from the start of the table to the string storage).
/// * `count` name records of 12 bytes each, starting at +6: `platformID`,
///   `encodingID`, `languageID`, `nameID`, `length`, `offset` (all u16; the
///   string offset is relative to the string storage).
///
/// Only nameID 16 (Typographic Family Name) and nameID 1 (Font Family) are
/// considered, and nameID 16 always wins over nameID 1. Within the same
/// nameID the platform decides: Windows (3, encoding 1, UTF-16BE) beats
/// Unicode (0, UTF-16BE), which beats Macintosh (1, encoding 0, MacRoman).
/// On an exact tie the record that appears first is kept. `languageID` is not
/// examined.
///
/// Records whose string lies outside `data`, whose encoding is not one of the
/// above, or whose UTF-16 payload is malformed are skipped. Returns `None` if
/// the header cannot be read or no record yields a name.
fn read_family_name(data: &[u8], table_offset: usize, table_length: usize) -> Option<String> {
    const HEADER_LEN: usize = 6;
    const RECORD_LEN: usize = 12;

    let record_count = usize::from(read_u16(data, table_offset + 2)?);
    let storage_base = table_offset + usize::from(read_u16(data, table_offset + 4)?);

    let mut winner: Option<String> = None;
    let mut winner_rank: u8 = 0; // higher = better; every real candidate ranks >= 1

    for index in 0..record_count {
        let record = table_offset + HEADER_LEN + index * RECORD_LEN;
        // Stop as soon as a record would extend past the declared table.
        if record + RECORD_LEN > table_offset + table_length {
            break;
        }

        let platform_id = read_u16(data, record)?;
        let encoding_id = read_u16(data, record + 2)?;
        let name_id = read_u16(data, record + 6)?;
        let byte_len = usize::from(read_u16(data, record + 8)?);
        let byte_offset = usize::from(read_u16(data, record + 10)?);

        // nameID 16 outranks nameID 1 regardless of platform (4 > 3).
        let name_rank: u8 = match name_id {
            16 => 4,
            1 => 0,
            _ => continue,
        };

        let start = storage_base + byte_offset;
        let raw = match data.get(start..start + byte_len) {
            Some(bytes) => bytes,
            None => continue,
        };

        let candidate = match (platform_id, encoding_id) {
            // Platform 0 — Unicode: UTF-16BE, any encoding ID.
            (0, _) => decode_utf16be(raw).map(|name| (name, 2u8)),
            // Platform 1 — Macintosh, encoding 0 = MacRoman.
            (1, 0) => Some((decode_mac_roman(raw), 1u8)),
            // Platform 3 — Windows, encoding 1 = Unicode BMP (UTF-16BE).
            (3, 1) => decode_utf16be(raw).map(|name| (name, 3u8)),
            _ => None,
        };

        if let Some((name, platform_rank)) = candidate {
            let rank = name_rank + platform_rank;
            if rank > winner_rank {
                winner_rank = rank;
                winner = Some(name);
            }
        }
    }

    winner
}
/// Parse font metadata from raw TTF/OTF bytes.
///
/// The buffer is interpreted as a single sfnt font whose table directory
/// starts at byte 0; the version tag itself is not inspected. Three tables
/// are consulted:
///
/// * `OS/2` — `usWeightClass` (+4), `fsSelection` (+62, bit 0 = italic),
///   `sTypoAscender` (+68) and `sTypoDescender` (+70).
/// * `head` — `unitsPerEm` (+18).
/// * `name` — family name, via `read_family_name`.
///
/// Returns `None` if the data is shorter than the 12-byte header, any of the
/// three tables is missing or too short for the fields above, an offset
/// points outside the buffer, or the `name` table yields no usable family
/// name.
pub fn parse_font_meta(data: &[u8]) -> Option<FontMeta> {
    const SFNT_HEADER_LEN: usize = 12;
    // sTypoDescender sits at offset 70 and is 2 bytes wide.
    const OS2_MIN_LEN: usize = 72;
    // unitsPerEm sits at offset 18 and is 2 bytes wide.
    const HEAD_MIN_LEN: usize = 20;

    if data.len() < SFNT_HEADER_LEN {
        return None;
    }

    // ---- OS/2 table ----
    let (os2, _) = find_table(data, b"OS/2").filter(|&(_, len)| len >= OS2_MIN_LEN)?;
    let weight = read_u16(data, os2 + 4)?;
    let italic = (read_u16(data, os2 + 62)? & 0x0001) != 0;
    let ascender = read_i16(data, os2 + 68)?;
    let descender = read_i16(data, os2 + 70)?;

    // ---- head table ----
    let (head, _) = find_table(data, b"head").filter(|&(_, len)| len >= HEAD_MIN_LEN)?;
    let units_per_em = read_u16(data, head + 18)?;

    // ---- name table ----
    let family = find_table(data, b"name")
        .and_then(|(offset, length)| read_family_name(data, offset, length))?;

    Some(FontMeta {
        family,
        weight,
        italic,
        units_per_em,
        ascender,
        descender,
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty buffer is too short to contain even the table count.
    #[test]
    fn find_table_returns_none_on_empty() {
        let empty: [u8; 0] = [];
        assert_eq!(find_table(&empty, b"head"), None);
    }

    /// Buffers without a usable table directory are rejected, whether they
    /// are shorter than the 12-byte header or merely zero-filled.
    #[test]
    fn parse_font_meta_returns_none_on_garbage() {
        let shorter_than_header = [0u8; 11];
        let zero_filled = [0u8; 64];
        assert!(parse_font_meta(&shorter_than_header).is_none());
        assert!(parse_font_meta(&zero_filled).is_none());
    }

    /// `variant_key` mirrors weight/italic, and 700 counts as bold while 400
    /// does not.
    #[test]
    fn variant_key_and_is_bold() {
        let bold_italic = FontMeta {
            family: String::from("Test"),
            weight: 700,
            italic: true,
            units_per_em: 1000,
            ascender: 800,
            descender: -200,
        };
        assert!(bold_italic.is_bold());
        assert!(bold_italic.italic);

        let FontVariantKey { weight, italic } = bold_italic.variant_key();
        assert_eq!(weight, 700);
        assert!(italic);

        let regular = FontMeta {
            weight: 400,
            italic: false,
            ..bold_italic.clone()
        };
        assert!(!regular.is_bold());
    }

    /// Two big-endian code units decode to the matching two characters.
    #[test]
    fn decode_utf16be_basic() {
        // "AB" in UTF-16BE
        let encoded = [0x00u8, 0x41, 0x00, 0x42];
        assert_eq!(decode_utf16be(&encoded), Some(String::from("AB")));
    }

    /// A trailing half code unit makes the whole input invalid.
    #[test]
    fn decode_utf16be_odd_length_returns_none() {
        let truncated = [0x00u8, 0x41, 0x00];
        assert_eq!(decode_utf16be(&truncated), None);
    }

    /// Pure-ASCII MacRoman input comes back unchanged.
    #[test]
    fn decode_mac_roman_ascii() {
        let decoded = decode_mac_roman(b"Noto Sans");
        assert_eq!(decoded, "Noto Sans");
    }
}