58问答库 > 用JAVA语言设计一个类,统计一篇英文文章的词频,并按照词频由高到低输出。修改下面代码就行了。

用JAVA语言设计一个类,统计一篇英文文章的词频,并按照词频由高到低输出。修改下面代码就行了。

2024-11-05 23:34:08

推荐回答（2个）

回答（1）：

这道题如果允许增加一个辅助类，实现起来会高效很多；如果必须限定在给定的框架内，代码会比较繁琐，效率也较低。

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

public class Article {

//保存文章的内容
String content;
//保存分割后的单词集合
String[] rawWords;
//保存统计后的单词集合
String[] words;
//保存单词对应的词频
int[] wordFreqs;

//构造函数，输入文章内容
//提高部分：从文件中读取
public Article() {
content = "kolya is one of the richest films i've seen in some time . zdenek sverak plays a confirmed old bachelor ( who's likely to remain so ) , who finds his life as a czech cellist increasingly impacted by the five-year old boy that he's taking care of . though it ends rather abruptly-- and i'm whining , 'cause i wanted to spend more time with these characters-- the acting , writing , and production values are as high as , if not higher than , comparable american dramas . this father-and-son delight-- sverak also wrote the script , while his son , jan , directed-- won a golden globe for best foreign language film and , a couple days after i saw it , walked away an oscar . in czech and russian , with english subtitles . ";
}

//对文章根据分隔符进行分词,将结果保存到rawWords数组中
public void splitWord(){
//分词的时候，因为标点符号不参与，所以所有的符号全部替换为空格
final char SPACE = ' ';
content = content.replace('\'', SPACE).replace(',', SPACE).replace('.', SPACE);
content = content.replace('(', SPACE).replace(')', SPACE).replace('-', SPACE);

rawWords = content.split("\\s+");//凡是空格隔开的都算单词，上面替换了', 所以I've 被分成2个 //单词
}

//统计词，遍历数组
public void countWordFreq() {
//将所有出现的字符串放入唯一的set中，不用map,是因为map寻找效率太低了
Set set = new TreeSet();

for(String word: rawWords){
set.add(word);
}

Iterator ite = set.iterator();

List wordsList = new ArrayList();
List freqList = new ArrayList();
//多少个字符串未知，所以用list来保存先
while(ite.hasNext()){
String word = (String) ite.next();

int count = 0;//统计相同字符串的个数
for(String str: rawWords){
if(str.equals(word)){
count++;
}
}

wordsList.add(word);
freqList.add(count++);
}

//存入数组当中
words = wordsList.toArray(new String[0]);

wordFreqs = new int[freqList.size()];
for(int i = 0; i < freqList.size(); i++){
wordFreqs[i] = freqList.get(i);
}

}

//根据词频，将词数组和词频数组进行降序排序
public void sort() {

class Word{
private String word;
private int freq;

public Word(String word, int freq){
this.word = word;
this.freq = freq;
}
}
//注意：此处排序，1）首先按照词频降序排列， 2）如果词频相同，按照字母降序排列，
//如 'abc' > 'ab' >'aa'
class WordComparator implements Comparator{

public int compare(Object o1, Object o2) {
Word word1 = (Word) o1;
Word word2 = (Word) o2;

if(word1.freq < word2.freq){
return 1;
}else if(word1.freq > word2.freq){
return -1;
}else{

int len1 = word1.word.trim().length();
int len2 = word2.word.trim().length();

String min = len1 > len2? word2.word: word1.word;
String max = len1 > len2? word1.word: word2.word;

for(int i = 0; i < min.length(); i++){
if(min.charAt(i) < max.charAt(i)){
return 1;
}
}

return 1;

}
}

}

List wordList = new ArrayList();

for(int i = 0; i < words.length; i++){
wordList.add(new Word(words[i], wordFreqs[i]));
}

Collections.sort(wordList, new WordComparator());

for(int i = 0; i < wordList.size(); i++){
Word wor = (Word) wordList.get(i);

words[i] = wor.word;
wordFreqs[i] = wor.freq;
}

}

//将排序结果输出
public void printResult() {
System.out.println("Total " + words.length + " different words in the content!");

for(int i = 0; i < words.length; i++){
System.out.println(wordFreqs[i] + " " + words[i]);
}
}

//测试类的功能
public static void main(String[] args) {
Article a = new Article();
a.splitWord();
a.countWordFreq();
a.sort();
a.printResult();
}
}

-----------------------
Total 99 different words in the content!
5 and
4 the
4 i
4 a
3 as
2 with
2 who
2 to
2 time
2 sverak
2 son
2 s
2 old
2 of
2 it
2 in
2 his
2 czech
1 zdenek
1 year
1 wrote
1 writing
1 won
1 whining
1 while
1 wanted
1 walked
1 ve
1 values
1 though
1 this
1 these
1 that
1 than
1 taking
1 subtitles
1 spend
1 some
1 so
1 seen
1 script
1 saw
1 russian
1 richest
1 remain
1 rather
1 production
1 plays
1 oscar
1 one
1 not
1 more
1 m
1 likely
1 life
1 language
1 kolya
1 jan
1 is
1 increasingly
1 impacted
1 if
1 higher
1 high
1 he
1 golden
1 globe
1 foreign
1 for
1 five
1 finds
1 films
1 film
1 father
1 english
1 ends
1 dramas
1 directed
1 delight
1 days
1 couple
1 confirmed
1 comparable
1 characters
1 cellist
1 cause
1 care
1 by
1 boy
1 best
1 bachelor
1 away
1 are
1 an
1 american
1 also
1 after
1 acting
1 abruptly

回答（2）：

测试结果为
共123个单词，以下为该文章出现的单词及其出现次数。
--------单词----次数--------
-------and----5--------
-------a----4--------
-------the----4--------
-------as----3--------
-------of----2--------
-------time----2--------
-------czech----2--------
-------son----2--------
-------i----2--------
-------to----2--------
-------old----2--------
-------his----2--------
-------with----2--------
-------it----2--------
-------sverak----2--------
-------in----2--------
-------for----1--------
-------higher----1--------
-------wrote----1--------
-------production----1--------
-------oscar----1--------
-------confirmed----1--------
-------are----1--------
-------zdenek----1--------
-------year----1--------
-------these----1--------
-------ends----1--------
-------comparable----1--------
-------not----1--------
-------he's----1--------
-------russian----1--------
-------'cause----1--------
-------bachelor----1--------
-------saw----1--------
-------language----1--------
-------some----1--------
-------i've----1--------
-------kolya----1--------
-------abruptly----1--------
-------wanted----1--------
-------delight----1--------
-------life----1--------
-------american----1--------
-------rather----1--------
-------best----1--------
-------subtitles----1--------
-------walked----1--------
-------dramas----1--------
-------films----1--------
-------seen----1--------
-------taking----1--------
-------impacted----1--------
-------remain----1--------
-------days----1--------
-------finds----1--------
-------by----1--------
-------plays----1--------
-------though----1--------
-------who----1--------
-------after----1--------
-------more----1--------
-------values----1--------
-------who's----1--------
-------care----1--------
-------jan----1--------
-------so----1--------
-------likely----1--------
-------richest----1--------
-------script----1--------
-------that----1--------
-------than----1--------
-------i'm----1--------
-------acting----1--------
-------foreign----1--------
-------english----1--------
-------this----1--------
-------characters----1--------
-------golden----1--------
-------one----1--------
-------writing----1--------
-------father----1--------
-------while----1--------
-------if----1--------
-------couple----1--------
-------won----1--------
-------globe----1--------
-------film----1--------
-------whining----1--------
-------is----1--------
-------five----1--------
-------cellist----1--------
-------spend----1--------
-------away----1--------
-------directed----1--------
-------an----1--------
-------increasingly----1--------
-------high----1--------
-------boy----1--------
-------also----1--------

以下是源码

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Counts how often each word occurs in an English text and reports the words
 * ordered from most to least frequent.
 *
 * <p>Typical usage is the four-step pipeline shown in {@link #main(String[])}:
 * {@link #splitWord()}, {@link #countWordFreq()}, {@link #sort()},
 * {@link #printResult()}.
 */
public class Article {

    /** Sample review text used by the no-argument constructor. */
    private static final String DEFAULT_CONTENT =
            "kolya is one of the richest films i've seen in some time . "
            + "zdenek sverak plays a confirmed old bachelor ( who's likely to remain so ) , "
            + "who finds his life as a czech cellist increasingly impacted by the five-year "
            + "old boy that he's taking care of . though it ends rather abruptly-- and i'm "
            + "whining , 'cause i wanted to spend more time with these characters-- the acting , "
            + "writing , and production values are as high as , if not higher than , comparable "
            + "american dramas . this father-and-son delight-- sverak also wrote the script , "
            + "while his son , jan , directed-- won a golden globe for best foreign language film "
            + "and , a couple days after i saw it , walked away an oscar . in czech and russian , "
            + "with english subtitles . ";

    /**
     * Delimiter between words: a space optionally followed by one of
     * {@code . , ( )}, a space, a comma and another space; or {@code "-- "};
     * or a single hyphen. Apostrophes are not delimiters, so {@code i've}
     * stays one word.
     */
    private static final String DELIMITER_REGEX = " [\\.,()]{0,1} {0,1},{0,1} {0,1}|-- |-";

    // Text of the article.
    String content;

    // Every token of the article in order of appearance (duplicates kept).
    String[] rawWords;

    // Distinct words; words[i] occurs wordFreqs[i] times.
    String[] words;

    // Frequency of each distinct word, parallel to words.
    int[] wordFreqs;

    /** Creates an article holding the built-in sample text. */
    public Article() {
        this(DEFAULT_CONTENT);
    }

    /**
     * Creates an article from the given text.
     *
     * @param content the text to analyse; must not be {@code null}
     * @throws IllegalArgumentException if {@code content} is {@code null}
     */
    public Article(String content) {
        if (content == null) {
            throw new IllegalArgumentException("content must not be null");
        }
        this.content = content;
    }

    /**
     * Splits {@link #content} on the delimiter pattern and stores the tokens in
     * {@link #rawWords}. Empty tokens, which {@code String.split} produces for a
     * leading delimiter or for adjacent delimiters, are dropped.
     */
    public void splitWord() {
        List<String> tokens = new ArrayList<String>();
        for (String piece : content.split(DELIMITER_REGEX)) {
            if (piece.length() > 0) {
                tokens.add(piece);
            }
        }
        rawWords = tokens.toArray(new String[0]);
    }

    /**
     * Counts the occurrences of every distinct token in {@link #rawWords}.
     *
     * <p>Afterwards {@link #words} holds each distinct word once, in order of
     * first appearance, and {@link #wordFreqs} holds the matching counts. Both
     * arrays are sized exactly to the number of distinct words.
     *
     * @throws IllegalStateException if {@link #splitWord()} has not been called
     */
    public void countWordFreq() {
        if (rawWords == null) {
            throw new IllegalStateException("splitWord() must be called before countWordFreq()");
        }

        // Maps each word to its slot in the result arrays so a lookup does not
        // need to scan all words seen so far.
        Map<String, Integer> slotOf = new HashMap<String, Integer>();
        String[] distinct = new String[rawWords.length];
        int[] counts = new int[rawWords.length];
        int used = 0;

        for (String token : rawWords) {
            Integer slot = slotOf.get(token);
            if (slot == null) {
                slot = Integer.valueOf(used);
                slotOf.put(token, slot);
                distinct[used] = token;
                used++;
            }
            counts[slot.intValue()]++;
        }

        // Trim to the used prefix so no null words / zero counts are exposed.
        words = new String[used];
        wordFreqs = new int[used];
        System.arraycopy(distinct, 0, words, 0, used);
        System.arraycopy(counts, 0, wordFreqs, 0, used);
    }

    /**
     * Reorders {@link #words} and {@link #wordFreqs} together by descending
     * frequency. Words with equal frequency keep their current relative order,
     * because {@code Collections.sort} is stable. Any {@code null} entry in
     * {@code words} is discarded.
     *
     * @throws IllegalStateException if {@link #countWordFreq()} has not been called
     */
    public void sort() {
        if (words == null || wordFreqs == null) {
            throw new IllegalStateException("countWordFreq() must be called before sort()");
        }

        final String[] oldWords = words;
        final int[] oldFreqs = wordFreqs;

        // Sort positions rather than the words themselves so that the two
        // parallel arrays can be rebuilt in the same order.
        List<Integer> order = new ArrayList<Integer>(oldWords.length);
        for (int i = 0; i < oldWords.length; i++) {
            if (oldWords[i] != null) {
                order.add(Integer.valueOf(i));
            }
        }

        Collections.sort(order, new Comparator<Integer>() {
            @Override
            public int compare(Integer left, Integer right) {
                int leftFreq = oldFreqs[left.intValue()];
                int rightFreq = oldFreqs[right.intValue()];
                // Explicit comparison instead of subtraction, which can overflow.
                if (leftFreq == rightFreq) {
                    return 0;
                }
                return leftFreq > rightFreq ? -1 : 1;
            }
        });

        String[] sortedWords = new String[order.size()];
        int[] sortedFreqs = new int[order.size()];
        for (int i = 0; i < sortedWords.length; i++) {
            int from = order.get(i).intValue();
            sortedWords[i] = oldWords[from];
            sortedFreqs[i] = oldFreqs[from];
        }
        this.words = sortedWords;
        this.wordFreqs = sortedFreqs;
    }

    /** Prints the total token count, a header line, and one line per distinct word. */
    public void printResult() {
        System.out.println("共" + this.rawWords.length + "个单词，以下为该文章出现的单词及其出现次数。");
        System.out.println("--------单词----次数--------");
        for (int i = 0; i < this.words.length; i++) {
            System.out.println("-------" + this.words[i] + "----" + this.wordFreqs[i] + "--------");
        }
    }

    /** Runs the whole pipeline on the built-in sample text. */
    public static void main(String[] args) {
        Article art = new Article();
        art.splitWord();
        art.countWordFreq();
        art.sort();
        art.printResult();
    }
}