If you prefer chinese, see http://cpp.winxgui.com/cn:allocators-performance-comparison.
Summary
In this article we'll talk about performance comparison of:
- new/delete
- AutoFreeAlloc
- ScopeAlloc
Because most of allocators focus to improve performance of small objects allocation, we only test to allocate small objects here.
Test Environment
- CPU: 1.2 G
- OS: Windows XP
- Compiler: Visual C++ 6.0
- Optimization: Maximize speed
- C Runtime library: Multithreaded DLL
- Settings: Win32 Release
Condition 1: One allocator instance only allocate a few small objects
Test method: One allocator instance only allocate one "int" object.
Source code (See <stdext/Memory.h>):
template <class LogT> class TestCompareAllocators : public TestCase { WINX_TEST_SUITE(TestCompareAllocators); WINX_TEST(testComparison1); WINX_TEST_SUITE_END(); public: enum { N = 60000 }; void doNewDelete1(LogT& log) { log.print("===== NewDelete =====\n"); PerformanceCounter counter; for (int i = 0; i < N; ++i) { int* p = new int; delete p; } counter.trace(log); } void doAutoFreeAlloc1(LogT& log) { log.print("===== AutoFreeAlloc =====\n"); PerformanceCounter counter; for (int i = 0; i < N; ++i) { AutoFreeAlloc alloc; int* p = STD_NEW(alloc, int); } counter.trace(log); } void doScopeAlloc1(LogT& log) { log.print("===== ScopeAlloc =====\n"); BlockPool recycle; PerformanceCounter counter; for (int i = 0; i < N; ++i) { ScopeAlloc alloc(recycle); int* p = STD_NEW(alloc, int); } counter.trace(log); } void testComparison1(LogT& log) { for (int i = 0; i < 4; ++i) { log.newline(); doAutoFreeAlloc1(log); doNewDelete1(log); doScopeAlloc1(log); } } };
Output:
===== AutoFreeAlloc =====
---> Elapse 283773 ticks (79.28 ms) (0.00 min) ...
===== NewDelete =====
---> Elapse 134936 ticks (37.70 ms) (0.00 min) ...
===== ScopeAlloc =====
---> Elapse 8285 ticks (2.31 ms) (0.00 min) ...
===== AutoFreeAlloc =====
---> Elapse 184090 ticks (51.43 ms) (0.00 min) ...
===== NewDelete =====
---> Elapse 116476 ticks (32.54 ms) (0.00 min) ...
===== ScopeAlloc =====
---> Elapse 9831 ticks (2.75 ms) (0.00 min) ...
===== AutoFreeAlloc =====
---> Elapse 179652 ticks (50.19 ms) (0.00 min) ...
===== NewDelete =====
---> Elapse 130553 ticks (36.47 ms) (0.00 min) ...
===== ScopeAlloc =====
---> Elapse 8387 ticks (2.34 ms) (0.00 min) ...
===== AutoFreeAlloc =====
---> Elapse 137255 ticks (38.34 ms) (0.00 min) ...
===== NewDelete =====
---> Elapse 103869 ticks (29.02 ms) (0.00 min) ...
===== ScopeAlloc =====
---> Elapse 8215 ticks (2.29 ms) (0.00 min) ...
Conclusion:
When an allocator instance only allocate a few memory, AutoFreeAlloc has the worst performance, new/delete is worse, and ScopeAlloc has good performance however.
Condition 2: One allocator instance allocate a large amount of small objects
Test method: One allocator instance allocate 6000 "int" objects.
Source code (See <Memory.h>):
template <class LogT> class TestCompareAllocators : public TestCase { WINX_TEST_SUITE(TestCompareAllocators); WINX_TEST(testComparison2); WINX_TEST_SUITE_END(); public: enum { N = 60000 }; void doNewDelete2(LogT& log) { int i, *p[N]; log.print("===== NewDelete =====\n"); PerformanceCounter counter; for (i = 0; i < N; ++i) { p[i] = new int; } for (i = 0; i < N; ++i) { delete p[i]; } counter.trace(log); } void doAutoFreeAlloc2(LogT& log) { log.print("===== AutoFreeAlloc =====\n"); PerformanceCounter counter; { AutoFreeAlloc alloc; for (int i = 0; i < N; ++i) { int* p = STD_NEW(alloc, int); } } counter.trace(log); } void doScopeAlloc2(LogT& log) { log.print("===== ScopeAlloc =====\n"); BlockPool recycle; PerformanceCounter counter; { ScopeAlloc alloc(recycle); for (int i = 0; i < N; ++i) { int* p = STD_NEW(alloc, int); } } counter.trace(log); } void testComparison2(LogT& log) { for (int i = 0; i < 4; ++i) { log.newline(); doAutoFreeAlloc2(log); doNewDelete2(log); doScopeAlloc2(log); } } };
Output:
===== AutoFreeAlloc =====
---> Elapse 1771 ticks (0.49 ms) (0.00 min) ...
===== NewDelete =====
---> Elapse 117791 ticks (32.91 ms) (0.00 min) ...
===== ScopeAlloc =====
---> Elapse 2657 ticks (0.74 ms) (0.00 min) ...
===== AutoFreeAlloc =====
---> Elapse 1637 ticks (0.46 ms) (0.00 min) ...
===== NewDelete =====
---> Elapse 114280 ticks (31.93 ms) (0.00 min) ...
===== ScopeAlloc =====
---> Elapse 2516 ticks (0.70 ms) (0.00 min) ...
===== AutoFreeAlloc =====
---> Elapse 1668 ticks (0.47 ms) (0.00 min) ...
===== NewDelete =====
---> Elapse 118661 ticks (33.15 ms) (0.00 min) ...
===== ScopeAlloc =====
---> Elapse 2553 ticks (0.71 ms) (0.00 min) ...
===== AutoFreeAlloc =====
---> Elapse 1678 ticks (0.47 ms) (0.00 min) ...
===== NewDelete =====
---> Elapse 113231 ticks (31.63 ms) (0.00 min) ...
===== ScopeAlloc =====
---> Elapse 3287 ticks (0.92 ms) (0.00 min) ...
Conclusion:
When an allocator instance allocate a large amount of objects, AutoFreeAlloc has the best performance, ScopeAlloc is better (and very close to AutoFreeAlloc), and new/delete has the worst performance.
Suggestions
Suggestion 1
AutoFreeAlloc is useful in some special conditions (eg. in an IO process, in a Layout Render process, etc). We use it in "The fastest word file writer".
Suggestion 2
If you don't know you want AutoFreeAlloc or ScopeAlloc, please choose ScopeAlloc. It have good performance in any condition indeed. If you need a timely memory recycle, ScopeAlloc is the only choice.
A complex example of ScopeAlloc
int main() { std::BlockPool recycle; std::ScopeAlloc alloc(recycle); printf("\n------------------- global: have 3 objs ----------------\n"); { Obj* a1 = STD_NEW(alloc, Obj)(0); Obj* a2 = STD_NEW_ARRAY(alloc, Obj, 2); printf("------------------- child 1: have 4 objs ----------------\n"); { std::ScopeAlloc child1(alloc); Obj* o1 = STD_NEW(child1, Obj)(1); Obj* o2 = STD_NEW_ARRAY(child1, Obj, 3); printf("------------------- child 11: have 3 objs ----------------\n"); { std::ScopeAlloc* child11 = STD_NEW(child1, std::ScopeAlloc)(child1); Obj* o11 = STD_NEW(*child11, Obj)(11); Obj* o12 = STD_NEW_ARRAY(*child11, Obj, 2); } printf("------------------- leave child 11 ----------------\n"); printf("------------------- child 12: have 3 objs ----------------\n"); { std::ScopeAlloc child12(child1); Obj* o11 = STD_NEW(child12, Obj)(12); Obj* o12 = STD_NEW_ARRAY(child12, Obj, 2); } printf("------------------- leave child 12 ----------------\n"); } printf("------------------- leave child 1 ----------------\n"); printf("------------------- child 2: have 4 objs ----------------\n"); { std::ScopeAlloc child2(alloc); Obj* o1 = STD_NEW(child2, Obj)(2); Obj* o2 = STD_NEW_ARRAY(child2, Obj, 3); } printf("------------------- leave child 2 ----------------\n"); } return 0; }
Output:
------------------- global: have 3 objs ----------------
construct Obj: 0
construct Obj: 0
construct Obj: 0
------------------- child 1: have 4 objs ----------------
construct Obj: 1
construct Obj: 0
construct Obj: 0
construct Obj: 0
------------------- child 11: have 3 objs ----------------
construct Obj: 11
construct Obj: 0
construct Obj: 0
------------------- leave child 11 ----------------
------------------- child 12: have 3 objs ----------------
construct Obj: 12
construct Obj: 0
construct Obj: 0
destruct Obj: 0
destruct Obj: 0
destruct Obj: 12
------------------- leave child 12 ----------------
destruct Obj: 0
destruct Obj: 0
destruct Obj: 11
destruct Obj: 0
destruct Obj: 0
destruct Obj: 0
destruct Obj: 1
------------------- leave child 1 ----------------
------------------- child 2: have 4 objs ----------------
construct Obj: 2
construct Obj: 0
construct Obj: 0
construct Obj: 0
destruct Obj: 0
destruct Obj: 0
destruct Obj: 0
destruct Obj: 2
------------------- leave child 2 ----------------
destruct Obj: 0
destruct Obj: 0
destruct Obj: 0