The code I'm using now is what I wrote for this previous post. It was meant to be simple to understand, and overall it gets the job done. But we can do better. Let's start with some profiling to establish a performance baseline.

float t = tMax;
// Bruteforce approach bool hit_anything = false; HitRecord tmp_hit; for(auto& tri : mTris) { if(tri.hit(r,tMin,t,tmp_hit)) { collision = tmp_hit; t = tmp_hit.t; hit_anything = true; } }
// Two consecutive edges of the triangle: v0 -> v1 and v1 -> v2.
auto edge0 = v[1] - v[0];
auto edge1 = v[2] - v[1];

// Normal of the triangle's plane, built from the cross product of the two
// edges (presumably normalize() rescales it to unit length -- confirm).
auto normal = normalize( cross(edge0, edge1) );

// Offset of the plane along `normal`, measured using vertex 0 as the
// reference point on the plane.
auto planeOffset = dot(v[0], normal);
/// Stateless accessor for a single 8-bit memory-mapped register.
///
/// Every operation is a volatile read and/or write of the byte located at
/// `address_`; the object itself stores nothing.
///
/// @tparam address_  Address of the register, reinterpreted as a
///                   `volatile uint8_t*`.
template<uint16_t address_>
struct Register
{
    /// Writes `_r` to the register.
    void operator= (uint8_t _r) { ref() = _r; }

    /// Reads the current value of the register.
    operator uint8_t () const { return ref(); }

    /// Exposes the register as a volatile reference for direct manipulation.
    operator volatile uint8_t& () { return ref(); }

    /// Sets bit `bit_` with a read-modify-write of the register.
    template<uint8_t bit_>
    void setBit()
    {
        // An index >= 8 would be truncated away on an 8-bit register, and a
        // shift by >= the width of int is undefined; reject both at compile time.
        static_assert(bit_ < 8, "Register::setBit: bit index must be in [0, 7]");
        ref() |= (1 << bit_);
    }

    /// Clears bit `bit_` with a read-modify-write of the register.
    template<uint8_t bit_>
    void clearBit()
    {
        static_assert(bit_ < 8, "Register::clearBit: bit index must be in [0, 7]");
        ref() &= ~(1 << bit_);
    }

private:
    /// Single place where the numeric address is turned into a volatile lvalue.
    static volatile uint8_t& ref()
    {
        return *reinterpret_cast<volatile uint8_t*>(address_);
    }
};

// Register instances and bit positions. The names follow the AVR port B
// convention (data direction / output register, pin 5); the addresses are
// presumably taken from the target MCU's data sheet -- confirm for your part.
Register<0x24> DDRB;
Register<0x25> PORTB;
constexpr uint8_t DDB5 = 5;
constexpr uint8_t PORTB5 = 5;