import operator
from django.db import models
from django.db.models import Q
from caching.base import CachingManager, CachingMixin
from msgvis.apps.base import models as base_models
from msgvis.apps.corpus import utils
import re
import os
from msgvis.settings.common import DEBUG
class Dataset(models.Model):
    """A top-level dataset object containing messages."""

    name = models.CharField(max_length=150)
    """The name of the dataset"""

    description = models.TextField()
    """A description of the dataset."""

    created_at = models.DateTimeField(auto_now_add=True)
    """The :py:class:`datetime.datetime` when the dataset was created."""

    start_time = models.DateTimeField(null=True, default=None, blank=True)
    """The time of the first real message in the dataset"""

    end_time = models.DateTimeField(null=True, default=None, blank=True)
    """The time of the last real message in the dataset"""

    has_prefetched_images = models.BooleanField(default=False)
    """When true, the image url properties of :class:`Person` and
    :class:`Message` reduce remote urls to bare file names."""

    @property
    def message_count(self):
        """The number of messages attached to this dataset."""
        return self.message_set.count()

    def __unicode__(self):
        return self.name

    @staticmethod
    def _apply_dimension_clauses(messages, filters, excludes):
        """Narrow ``messages`` with every filter clause, then every exclude clause.

        Each clause is a dictionary holding a ``"dimension"`` object plus the
        keyword parameters handed to that dimension's ``filter`` / ``exclude``
        method.
        """
        for clause in filters:
            dimension = clause["dimension"]
            # Everything except the dimension itself is a query parameter.
            params = {key: value for key, value in clause.items() if key != "dimension"}
            messages = dimension.filter(messages, **params)

        for clause in excludes:
            dimension = clause["dimension"]
            # Read the params from the current clause, not from the whole list.
            params = {key: value for key, value in clause.items() if key != "dimension"}
            messages = dimension.exclude(messages, **params)

        return messages

    def get_example_messages(self, filters=None, excludes=None):
        """Get example messages given some filters (dictionaries containing dimensions and filter params)

        :param filters: clauses whose dimension ``filter`` is applied; defaults to none.
        :param excludes: clauses whose dimension ``exclude`` is applied; defaults to none.
        :return: the dataset's messages after all clauses were applied.
        """
        return self._apply_dimension_clauses(self.message_set.all(),
                                             filters or [],
                                             excludes or [])

    def get_example_messages_by_groups(self, groups, filters=None, excludes=None):
        """Get example messages for a set of groups as one UNION query.

        :param groups: ids of the groups to draw messages from.
        :param filters: filter clauses; clauses on the ``groups`` dimension,
            when present, replace ``groups``.
        :param excludes: exclude clauses; clauses on the ``groups`` dimension
            remove the matching ids from ``groups``.
        :return: a raw queryset over the union of the per-group messages, or
            an empty queryset when no group is left to query.
        """
        filters = filters or []
        excludes = excludes or []

        include_groups = [int(clause['value']) for clause in filters
                          if clause['dimension'].key == 'groups']
        if include_groups:
            groups = include_groups

        exclude_groups = [int(clause['value']) for clause in excludes
                          if clause['dimension'].key == 'groups']
        groups = [group for group in groups if group not in exclude_groups]

        if not groups:
            # Nothing to union; an empty SQL string cannot be run as a raw query.
            return self.message_set.none()

        group_querysets = []
        for group in groups:
            # NOTE(review): ``groups`` is a reverse relation declared outside this class.
            group_obj = self.groups.get(id=group)
            group_querysets.append(
                self._apply_dimension_clauses(group_obj.messages, filters, excludes))

        # The SQL text comes from Django-generated queries, post-processed by
        # the project's quoting helpers before being run as a raw query.
        query = " UNION ".join("(%s)" % (utils.quote(str(queryset.query)))
                               for queryset in group_querysets)
        query = utils.convert_boolean(query)
        return Message.objects.raw(query)

    def get_dictionary(self):
        """Return the first dictionary related to this dataset, or None if there is none."""
        dictionary = self.dictionary.all()
        if len(dictionary) > 0:
            return dictionary[0]
        return None

    def get_advanced_search_results(self, keywords_text, include_types):
        """Search the dataset's messages with a comma separated keyword expression.

        Each comma separated clause is a space separated list of words. A
        clause starting with ``"NOT "`` lists words to exclude; every other
        clause contributes the messages matching all of its word conditions,
        and the results of those clauses are OR-ed together.

        :param keywords_text: the raw search expression.
        :param include_types: message type objects (with a ``name``) to
            restrict the search to; empty means no restriction.
        :return: a distinct queryset of matching messages.
        """
        word_queryset = self.tweet_words.all()
        message_queryset = self.message_set.all()
        if len(include_types) > 0:
            type_names = [message_type.name for message_type in include_types]
            message_queryset = message_queryset.filter(utils.levels_or('type__name', type_names))

        exclusive_keywords = []
        final_queryset = self.message_set.none()
        for clause in keywords_text.split(','):
            if clause.startswith("NOT "):
                words = clause[4:].split(' ')
                word_list = utils.get_word_objs(queryset=word_queryset, text_field_name='original_text',
                                                related_field_name="tweet_words__id", words=words)
                if len(word_list) > 0:
                    # TODO: makes this real AND
                    exclusive_keywords.extend(word_list)
            else:
                words = clause.split(' ')
                word_list = utils.get_word_objs(queryset=word_queryset, text_field_name='original_text',
                                                related_field_name="tweet_words__id", words=words)
                if len(word_list) > 0:
                    # Chained filters: a message must satisfy every word condition.
                    clause_queryset = message_queryset
                    for or_word_list in word_list:
                        clause_queryset = clause_queryset.filter(or_word_list)
                    final_queryset |= clause_queryset

        queryset = final_queryset
        for word in exclusive_keywords:
            queryset = queryset.exclude(word)
        return queryset.distinct()

    def get_precalc_distribution(self, dimension, search_key=None, page=None, page_size=100, mode=None):
        """Build the precalculated distribution table for a dimension.

        :param dimension: dimension object providing ``key`` and ``get_domain_labels``.
        :param search_key: optional case-insensitive substring the levels must contain.
        :param page: optional 1-based page number; when given, only that page is returned.
        :param page_size: number of levels per page.
        :param mode: without ``page``, ``"omit_others"`` and ``"enable_others"``
            keep only the 10 most frequent levels.
        :return: a dict with ``table``, ``domains`` and ``domain_labels``, or
            None when the requested page holds no levels.
        """
        max_categorical_levels = 10

        dimension_key = dimension.key
        distribution = self.distributions.filter(dimension_key=dimension_key)
        if search_key is not None:
            distribution = distribution.filter(level__icontains=search_key)
        distribution = distribution.order_by('-count')

        if page is not None:
            total_num_levels = distribution.count()
            start = (page - 1) * page_size
            # No level left: this also covers an empty distribution and the
            # page that would start exactly past the last level.
            if start >= total_num_levels:
                return None
            end = min(start + page_size, total_num_levels)
            distribution = distribution[start:end]
        elif mode in ("omit_others", "enable_others"):
            distribution = distribution[:max_categorical_levels]
        else:
            distribution = distribution.all()

        domain = [entry.level for entry in distribution]
        labels = dimension.get_domain_labels(domain)
        table = [{dimension_key: entry.level, "value": entry.count} for entry in distribution]

        return {
            "table": table,
            "domains": {dimension_key: domain},
            "domain_labels": {dimension_key: labels},
        }
class MessageType(models.Model):
    """Category of a message, for example retweet, reply, original or system."""

    name = models.CharField(unique=True, max_length=100)
    """Unique name identifying the message type"""

    def __unicode__(self):
        # The type is displayed by its name alone.
        return self.name
class Language(CachingMixin, models.Model):
    """The language a message is written in or a user speaks."""

    code = models.SlugField(unique=True, max_length=10)
    """Unique short code of the language, such as 'en'"""

    name = models.CharField(max_length=100)
    """Full, human readable name of the language"""

    # Lookups go through the caching manager.
    objects = CachingManager()

    def __unicode__(self):
        # Rendered as "<code>:<name>".
        parts = (self.code, self.name)
        return "%s:%s" % parts
class Url(models.Model):
    """A url that appears in a message."""

    domain = models.CharField(db_index=True, max_length=100)
    """Root domain of the url (indexed)"""

    short_url = models.CharField(blank=True, max_length=250)
    """Shortened form of the url; may be left blank"""

    full_url = models.TextField()
    """The complete url"""
class Hashtag(models.Model):
    """A hashtag that appears in a message."""

    text = base_models.Utf8CharField(db_index=True, max_length=100)
    """Hashtag text without the leading hash sign (indexed)"""
class Timezone(CachingMixin, models.Model):
    """
    The timezone attached to a message or a user.
    """

    olson_code = models.CharField(
        max_length=40,
        null=True,
        blank=True,
        default=None,
    )
    """Optional timezone code as used by pytz."""

    name = models.CharField(max_length=150, db_index=True)
    """Alternative name for the timezone, perhaps the country it lies in (indexed)."""

    # Lookups go through the caching manager.
    objects = CachingManager()
class Person(models.Model):
    """
    A person who sends messages in a dataset.
    """

    class Meta:
        # A tuple of tuples (note the trailing comma), matching Message.Meta.
        index_together = (
            ('dataset', 'original_id'),  # used by the importer
        )

    dataset = models.ForeignKey(Dataset)
    """Which :class:`Dataset` this person belongs to"""

    original_id = models.BigIntegerField(null=True, blank=True, default=None)
    """An external id for the person, e.g. a user id from Twitter"""

    username = base_models.Utf8CharField(max_length=150, null=True, blank=True, default=None)
    """Username is a short system-y name."""

    full_name = base_models.Utf8CharField(max_length=250, null=True, blank=True, default=None)
    """Full name is a longer user-friendly name"""

    language = models.ForeignKey(Language, null=True, blank=True, default=None)
    """The person's primary :class:`Language`"""

    message_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of messages the person produced"""

    replied_to_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of times the person's messages were replied to"""

    shared_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of times the person's messages were shared or retweeted"""

    mentioned_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of times the person was mentioned in other people's messages"""

    friend_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of people this user has connected to"""

    follower_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of people who have connected to this person"""

    profile_image_url = models.TextField(null=True, blank=True, default="")
    """The person's profile image url"""

    def __unicode__(self):
        # username is nullable, but __unicode__ must always return text.
        return self.username or u""

    @property
    def profile_image_processed_url(self):
        """The profile image url, or a local file name for prefetched datasets.

        When the dataset has prefetched images and the url ends in a file
        name with an extension, the result is
        ``"profile_<original_id>.<extension>"``. Otherwise the stored url is
        returned unchanged (it may be empty or None).
        """
        url = self.profile_image_url
        # The url is nullable; skip the rewrite for both None and "".
        if url and self.dataset.has_prefetched_images:
            pattern = re.compile(r'/[_\.\-\w\d]+\.([\w]+)$')
            results = pattern.search(url)
            if results:
                suffix = results.groups()[0]
                url = "profile_" + str(self.original_id) + "." + suffix
        return url
class Message(models.Model):
    """
    The Message is the central data entity for the dataset.
    """

    class Meta:
        index_together = (
            ('dataset', 'original_id'),  # used by importer
            ('dataset', 'time'),
        )

    dataset = models.ForeignKey(Dataset)
    """Which :class:`Dataset` the message belongs to"""

    original_id = models.BigIntegerField(null=True, blank=True, default=None)
    """An external id for the message, e.g. a tweet id from Twitter"""

    type = models.ForeignKey(MessageType, null=True, blank=True, default=None)
    """The :class:`MessageType` Message type: retweet, reply, origin..."""

    sender = models.ForeignKey(Person, null=True, blank=True, default=None)
    """The :class:`Person` who sent the message"""

    time = models.DateTimeField(null=True, blank=True, default=None)
    """The :py:class:`datetime.datetime` (in UTC) when the message was sent"""

    language = models.ForeignKey(Language, null=True, blank=True, default=None)
    """The :class:`Language` of the message."""

    SENTIMENT_POSITIVE = 1
    SENTIMENT_NEUTRAL = 0
    SENTIMENT_NEGATIVE = -1
    SENTIMENT_CHOICES = (
        (SENTIMENT_POSITIVE, "positive"),
        (SENTIMENT_NEUTRAL, "neutral"),
        (SENTIMENT_NEGATIVE, "negative")
    )

    sentiment = models.SmallIntegerField(choices=SENTIMENT_CHOICES, null=True, blank=True, default=None)
    """The sentiment label for message."""

    timezone = models.ForeignKey(Timezone, null=True, blank=True, default=None)
    """The :class:`Timezone` of the message."""

    replied_to_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of replies this message received."""

    shared_count = models.PositiveIntegerField(blank=True, default=0)
    """The number of times this message was shared or retweeted."""

    contains_hashtag = models.BooleanField(blank=True, default=False)
    """True if the message has a :class:`Hashtag`."""

    contains_url = models.BooleanField(blank=True, default=False)
    """True if the message has a :class:`Url`."""

    contains_media = models.BooleanField(blank=True, default=False)
    """True if the message has any :class:`Media`."""

    contains_mention = models.BooleanField(blank=True, default=False)
    """True if the message mentions any :class:`Person`."""

    urls = models.ManyToManyField(Url, null=True, blank=True, default=None)
    """The set of :class:`Url` in the message."""

    hashtags = models.ManyToManyField(Hashtag, null=True, blank=True, default=None)
    """The set of :class:`Hashtag` in the message."""

    media = models.ManyToManyField(Media, null=True, blank=True, default=None)
    """The set of :class:`Media` in the message."""

    mentions = models.ManyToManyField(Person, related_name="mentioned_in", null=True, blank=True, default=None)
    """The set of :class:`Person` mentioned in the message."""

    text = base_models.Utf8TextField(null=True, blank=True, default="")
    """The actual text of the message."""

    @property
    def embedded_html(self):
        """The message text rendered through ``utils.render_html_tag``."""
        return utils.render_html_tag(self.text)

    @property
    def media_url(self):
        """The url of the message's first media item, or ``""`` if there is none.

        When the dataset has prefetched images and the url ends in a file
        name with an extension, only that file name is returned.
        """
        url = ""
        if self.contains_media:
            try:
                url = self.media.all()[0].media_url
            except IndexError:
                # The flag is set but no media row is linked; behave as "no media".
                return ""
            # Guard against an empty or missing url before pattern matching.
            if url and self.dataset.has_prefetched_images:
                pattern = re.compile(r'/([_\.\-\w\d]+\.[\w]+)$')
                results = pattern.search(url)
                if results:
                    url = results.groups()[0]
        return url

    def __repr__(self):
        # text is nullable; fall back to an empty string so concatenation works.
        return str(self.time) + " || " + (self.text or "")

    def __unicode__(self):
        return self.__repr__()