标题：海量数据处理：找出现次数最多的那些... 出处：Felix021 时间：Sun, 14 Jun 2009 17:34:46 +0000 作者：felix021 地址：https://www.felix021.com/blog/read.php?1652 内容：回忆一下tx的二面，有一道题是这样的：假设有1kw个身份证号，以及他们对应的数据。身份证号可能重复，要求找出出现次数最多的身份证号。一个很显然的做法是，hash之，O(n)搞定。这前提是内存中可以存下。如果是中国的13亿人口，内存中存不下呢？借用磁盘，多次扫描？磁盘IO的速度慢得能让你疯掉。这时候你终于感受到，中国人口真TMD多阿。。。 -- 然后再看看今天的astar复赛第一题：百度每天都会接受数亿的查询请求, 如何在这么多的查询(Query)中找出高频的Query是一个不小的挑战. 而你的任务则更加艰巨, 你需要在极其有限的资源下来找出这些高频的Query.(使用内存不得多于1MB) 。输入文件是一行一个Query, 以文件结束符结尾。每个Query字节数L(一个汉字两个字节)满足：0 #include #include #include #include using namespace std; const int hashm = 103; inline int hash(char *p){ int t = 0; while (*p){ t = (*p++) + (t << 6) + (t << 16) - t; } return ((t & 0x7fffffff) % hashm); } struct node_t{ char s[17]; int count; node_t& operator=(const char *t){ strcpy(s, t); count = 1; return (*this); } }; struct great_node_t{ char *s; int count; bool operator<(const great_node_t &a)const{ return (count > a.count); } great_node_t(const char *_s, int &_count){ s = (char*)_s; count = _count; } }; node_t tbl[hashm][100]; int tblc[hashm]; vector ans; int main(){ char t[20]; int i, j; int v; //freopen("a.in", "r", stdin); for (i = 0; i < hashm; ++i){ tblc[i] = 0; } while (scanf("%s", t) == 1){ v = hash(t); for(i = 0, j = 0; i < tblc[v]; ++i){ if(strcmp(tbl[v][i].s, t) == 0){ tbl[v][i].count++; break; } if(tbl[v][i].count < tbl[v][j].count){ j = i; } } if(i == tblc[v]){ if(i < 99){ tblc[v]++; tbl[v][i] = t; }else{ tbl[v][j] = t; } } } for (i = 0; i < hashm; ++i){ for (j = 0; j < tblc[i]; ++j){ ans.push_back(great_node_t(tbl[i][j].s, tbl[i][j].count)); } } sort(ans.begin(), ans.end()); for(int i = 0; i < 100 && i < (int)ans.size(); ++i){ printf("(%d) %s\n", ans[i].count, ans[i].s); } return 0; } //数据生成代码 #include #include #include using namespace std; int main(){ int i, j; srand(time(NULL)); const int maxline = 1000000; freopen("a.in", "w", stdout); for (i = 0; i < maxline; ++i){ int t = rand() % 15 + 1; for (j = 0; j < t; ++j){ printf("%c", rand() % 26 + 'a'); } printf("\n"); } return 0; } Generated by Bo-blog 2.1.0